Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import string
- from collections import Counter
class TopCommonWords(object):
    """Find the most common words in a text file.

    Every whitespace-separated token is lowercased and has whitespace,
    punctuation, digits and quotes stripped from both of its ends before it
    is counted.  Characters in the middle of a token are left untouched.
    """

    # Characters removed from both ends of each token.  string.punctuation
    # already contains both quote characters; the explicit suffix is
    # redundant but harmless.
    STRIP_CHARS = string.whitespace + string.punctuation + string.digits + "\"'"

    def __init__(self, min_word_length=3, common_words_result=10, filename=None):
        """Store the counting parameters.

        @min_word_length - exclusive length threshold: only words strictly
            longer than this value are counted (with the default of 3,
            words need at least 4 characters).
        @common_words_result - how many of the most common words to report.
        @filename - optional path of the file to process; when omitted the
            user is prompted for it on first access of ``filename``.
        """
        self._filename = filename
        self.min_word_length = min_word_length
        self.common_words_result = common_words_result
        # Filled in by process_file(); starts empty so print_results() is
        # safe to call before any file has been processed.
        self.top = Counter()

    @property
    def filename(self):
        """Return the stored filename, asking the user for one if unset.

        Raises ValueError when the path typed by the user is not an
        existing file.
        """
        if self._filename:
            return self._filename
        # raw_input only exists on Python 2; fall back to input on Python 3.
        try:
            read_line = raw_input
        except NameError:
            read_line = input
        candidate = read_line('Please enter filepath to file:\n')
        if not os.path.isfile(candidate):
            raise ValueError("Supllied path %s does not link to file. Exiting.." % candidate)
        self._filename = candidate
        return self._filename

    @filename.setter
    def filename(self, value):
        """Set the path of the file to process (stored without validation)."""
        self._filename = value

    def getwords(self):
        """Lazily yield the normalized words of the file, one at a time.

        The file is read line by line, so it is never loaded into memory as
        a whole, and it is closed once the generator is exhausted or closed.
        """
        with open(self.filename) as handle:
            for line in handle:
                for token in line.lower().split():
                    word = token.strip(self.STRIP_CHARS)
                    # Strict comparison: words of exactly min_word_length
                    # characters are skipped as well.
                    if len(word) > self.min_word_length:
                        yield word

    def process_file(self):
        """Count the words of the file and print the most common ones.

        A bad path (ValueError from ``filename``) or an unreadable file
        (IOError from ``open``) is reported by printing the error message;
        in that case ``self.top`` is left unchanged and no results are
        printed.
        """
        try:
            self.top = Counter(self.getwords())
        except (ValueError, IOError) as error:
            # str(error) instead of the deprecated, Python-2-only .message
            print(str(error))
        else:
            self.print_results()

    def print_results(self):
        """Print a header line followed by one (word, count) tuple per line."""
        print("Top %s results processing %s" % (self.common_words_result, self._filename))
        for entry in self.top.most_common(self.common_words_result):
            print(entry)
if __name__ == '__main__':
    # Script entry point: build a counter with the default settings and
    # process the file, which prompts for the path on first use.
    word_counter = TopCommonWords()
    word_counter.process_file()
Advertisement
Add Comment
Please, Sign In to add comment