Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re
from collections import Counter
from concurrent.futures import ThreadPoolExecutor
# Module-wide executor that the *_async helpers submit their work to;
# at most four worker threads run at once.
thread_pool = ThreadPoolExecutor(max_workers=4)
def load_words(filename, encoding=None):
    '''Returns a list containing every word in `filename`.

    A "word" here is any whitespace-delimited token; no further cleaning
    (punctuation, case) is applied.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding handed to `open`. The default `None`
            keeps the platform's default encoding.

    Returns:
        A list of tokens in file order. It never contains empty strings
        or tokens with embedded whitespace/newlines.
    '''
    word_list = []
    with open(filename, 'r', encoding=encoding) as f:
        for line in f:
            # split() with no separator splits on any run of whitespace
            # (spaces, tabs, the trailing newline) and drops empty
            # strings, unlike split(' ') which keeps both.
            word_list.extend(line.split())
    return word_list
def clean_words(words):
    '''Returns a list containing only words and all lowercased.

    For each entry of `words`, the first run of ASCII letters is
    extracted and lowercased. Entries that contain no letters at all
    are dropped. Only the first run is kept, so "don't" yields "don".

    Args:
        words: Iterable of strings to clean.

    Returns:
        A list of lowercase alphabetic strings, in input order.
    '''
    # Compile once up front instead of re-resolving the pattern on every
    # iteration of the loop.
    first_letter_run = re.compile(r'[a-z]+', re.IGNORECASE)
    matches = (first_letter_run.search(word) for word in words)
    return [m.group(0).lower() for m in matches if m]
def count_words(words):
    '''Returns a dictionary mapping each word to the number of times
    it appears.

    Args:
        words: Iterable of hashable items (normally strings) to tally.

    Returns:
        A plain `dict` of item -> occurrence count, with keys in order
        of first appearance. An empty input gives an empty dict.
    '''
    # Counter does the tallying; convert back to a plain dict so callers
    # keep getting exactly the type they received before.
    return dict(Counter(words))
def get_most_common(word_count, n=10):
    '''Returns the `n` most common words based on the count.

    `word_count` maps each word to its count. The result is a list of
    words only (no counts), highest count first; words with equal
    counts stay in the order they appear in `word_count`.
    '''
    # sorted() is stable even with reverse=True, so ties keep dict order.
    ranked = sorted(word_count, key=word_count.get, reverse=True)
    return ranked[:n]
def load_words_async(filename):
    '''Schedules `load_words(filename)` on the shared `thread_pool`.

    Returns the future for that call; its result is the list of every
    word in `filename`.
    '''
    future = thread_pool.submit(load_words, filename)
    return future
def clean_words_async(words):
    '''Schedules `clean_words(words)` on the shared `thread_pool`.

    Returns the future for that call; its result is the list of only
    words, all lowercased.
    '''
    future = thread_pool.submit(clean_words, words)
    return future
def count_words_async(words):
    '''Schedules `count_words(words)` on the shared `thread_pool`.

    Returns the future for that call; its result is the dictionary
    mapping each word to the number of times it appears.
    '''
    future = thread_pool.submit(count_words, words)
    return future
def get_most_common_async(word_count, n=10):
    '''Schedules `get_most_common(word_count, n)` on the shared
    `thread_pool`.

    Returns the future for that call; its result is the list of the
    `n` most common words based on the count.
    '''
    future = thread_pool.submit(get_most_common, word_count, n)
    return future
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement