pymen

TopCommonWordsInFile

Sep 13th, 2013
80
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.10 KB | None | 0 0
  1. import os
  2. import string
  3. from collections import Counter
  4.  
  5. class TopCommonWords():
  6.     """
  7.    Class for finding Top common words in any text file
  8.    Each word will be converted to lowercase, and stripped all punctuation, whitespaces, digits, quotes
  9.    """
  10.     STRIP_CHARS=string.whitespace + string.punctuation + string.digits + "\"'"
  11.  
  12.     def __init__(self, min_word_length=3, common_words_result=10):
  13.         """
  14.        In init will be only some params
  15.        @min_word_length - param, minimum length of word
  16.        @common_words_result - how
  17.        """
  18.         self._filename = None
  19.         self.min_word_length = min_word_length
  20.         self.common_words_result = common_words_result
  21.  
  22.     @property
  23.     def filename(self):
  24.         """return  previously setted filename or asks user to input filename"""
  25.         if not self._filename:
  26.             filename = raw_input('Please enter filepath to file:\n')
  27.             if os.path.isfile(filename):
  28.                 self._filename = filename
  29.                 return self._filename
  30.             else:
  31.                 raise ValueError("Supllied path %s does not link to file. Exiting.." % filename)
  32.         else:
  33.             return  self._filename
  34.  
  35.     def getwords(self):
  36.         """Memory efficient iteration thought words in file"""
  37.         for line in open(self.filename):
  38.             for word in line.lower().split():
  39.                 word=word.strip(self.STRIP_CHARS)
  40.                 if len(word) > self.min_word_length:
  41.                     yield (word)
  42.  
  43.     def process_file(self):
  44.         """Process file and print results """
  45.         try:
  46.             self.top = Counter(self.getwords())
  47.         except ValueError as e:
  48.             print e.message
  49.         else:
  50.             self.print_results()
  51.  
  52.     def print_results(self):
  53.         """printing results after processing"""
  54.  
  55.         print "Top %s results processing %s" % (self.common_words_result, self._filename)
  56.         for each in self.top.most_common(self.common_words_result):
  57.             print each
  58.  
  59.  
  60. if __name__ == '__main__':
  61.     top = TopCommonWords()
  62.     top.process_file()
Advertisement
Add Comment
Please, Sign In to add comment