import re
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
# Module-wide executor that the *_async helpers submit their work to;
# at most 4 worker threads run at once.
thread_pool = ThreadPoolExecutor(max_workers=4)
def load_words(filename):
    '''Returns a list containing every word in `filename`.

    A word is any whitespace-separated token; tokens are returned in file
    order and are not otherwise cleaned (punctuation and case are kept).
    Blank lines contribute nothing, so an empty file yields `[]`.

    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened.
    '''
    word_list = []
    with open(filename, 'r') as f:
        for line in f:
            # split() with no argument splits on runs of any whitespace, so
            # tabs separate words and no token is '' or ends in '\n'
            # (split(' ') produced both).
            word_list.extend(line.split())
    return word_list
def clean_words(words):
    '''Returns the lowercased letters-only form of each item in `words`.

    For every item the first run of letters matching `[a-z]+`
    (case-insensitively) is kept and lowercased; items containing no such
    run are dropped. Input order is preserved.
    '''
    letters = re.compile('[a-z]+', re.IGNORECASE)
    hits = (letters.search(token) for token in words)
    return [hit.group(0).lower() for hit in hits if hit]
def count_words(words):
    '''Returns a dictionary mapping each word to the number of times
    it appears.

    `words` is an iterable of hashable items (not a mapping). Keys appear
    in the order each word is first seen; an empty input gives `{}`.
    '''
    # Counter does the tallying; converting back to a plain dict keeps the
    # original return type, so looking up an unseen word still raises
    # KeyError instead of returning 0.
    return dict(Counter(words))
def get_most_common(word_count, n=10):
    '''Returns the `n` most common words based on the count.

    `word_count` maps each word to its count. The result lists words from
    highest count to lowest and is cut to the first `n` entries; words
    with equal counts keep the order in which `word_count` yields them.
    '''
    # Sorting the keys by their count is stable, so ties stay in mapping order.
    ranked = sorted(word_count, key=word_count.get, reverse=True)
    return ranked[:n]
def load_words_async(filename):
    '''Schedules `load_words(filename)` on the shared thread pool.

    Returns a future whose result is the list of words in `filename`.
    '''
    pending = thread_pool.submit(load_words, filename)
    return pending
def clean_words_async(words):
    '''Schedules `clean_words(words)` on the shared thread pool.

    Returns a future whose result is the cleaned, lowercased word list.
    '''
    pending = thread_pool.submit(clean_words, words)
    return pending
def count_words_async(words):
    '''Schedules `count_words(words)` on the shared thread pool.

    Returns a future whose result is a dictionary mapping each word to
    the number of times it appears.
    '''
    pending = thread_pool.submit(count_words, words)
    return pending
def get_most_common_async(word_count, n=10):
    '''Schedules `get_most_common(word_count, n)` on the shared thread pool.

    Returns a future whose result is the list of the `n` most common
    words based on the count.
    '''
    pending = thread_pool.submit(get_most_common, word_count, n)
    return pending