import re
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
from operator import itemgetter

# Shared worker pool used by every ``*_async`` helper below.
thread_pool = ThreadPoolExecutor(max_workers=4)

# First run of ASCII letters in a token; compiled once instead of per word.
_WORD_RE = re.compile(r'[a-z]+', re.IGNORECASE)


def load_words(filename):
    '''Returns a list containing every word in `filename`.

    The file is read line by line and each line is split on any run of
    whitespace, so tokens never carry a trailing newline, repeated spaces
    do not produce empty strings, and tab-separated words are kept apart.
    Punctuation and case are left untouched (see `clean_words`).

    Raises whatever `open` raises if the file cannot be opened.
    '''
    word_list = []
    with open(filename, 'r') as f:
        for line in f:
            # split() with no argument splits on arbitrary whitespace and
            # drops empty tokens; split(' ') did neither.
            word_list.extend(line.split())
    return word_list


def clean_words(words):
    '''Returns a list containing only words and all lowercased.

    For each token, the first run of ASCII letters (a-z, case-insensitive)
    is kept and lowercased; tokens with no ASCII letter are dropped.
    For example "Hello," becomes "hello" and "it's" becomes "it".
    '''
    clean_list = []
    for word in words:
        match = _WORD_RE.search(word)
        if match:
            clean_list.append(match.group(0).lower())
    return clean_list


def count_words(words):
    '''Returns a dictionary mapping each word to the number of times it appears.

    Keys appear in the order each word is first seen in `words`.
    '''
    # Convert back to a plain dict so missing keys still raise KeyError
    # for callers, exactly as before.
    return dict(Counter(words))


def get_most_common(word_count, n=10):
    '''Returns the `n` most common words based on the count.

    `word_count` maps words to counts. Words are ordered by descending
    count; words with equal counts keep their order from `word_count`
    because the sort is stable.
    '''
    ranked = sorted(word_count.items(), key=itemgetter(1), reverse=True)
    # Slice first so only the requested entries are projected to words.
    return [word for word, _ in ranked[:n]]


def load_words_async(filename):
    '''Returns a future for a list containing every word in `filename`.'''
    return thread_pool.submit(load_words, filename)


def clean_words_async(words):
    '''Returns a future for a list of only words and all lowercased.

    `words` is a list of tokens, not a future.
    '''
    return thread_pool.submit(clean_words, words)


def count_words_async(words):
    '''Returns a future for a dictionary mapping each word to the number of times it appears.

    `words` is a list of words, not a future.
    '''
    return thread_pool.submit(count_words, words)


def get_most_common_async(word_count, n=10):
    '''Returns a future for the `n` most common words based on the count.

    `word_count` is a dictionary of counts, not a future.
    '''
    return thread_pool.submit(get_most_common, word_count, n)