Advertisement
Guest User

top20words

a guest
May 3rd, 2016
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.80 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. #
  3. #
  4. import sys
  5. import re
  6. from itertools import groupby
  7. from operator import itemgetter
  8.  
  9.  
  10. class Mapper:
  11.  
  12.     re_en = re.compile(ur"[a-z]+")
  13.     re_ru = re.compile(ur"[а-яё]+")
  14.  
  15.     def __init__(self, lang):
  16.         if lang == "en":
  17.             self.re = Mapper.re_en
  18.         else:
  19.             self.re = Mapper.re_ru
  20.         self.results = {}
  21.  
  22.     def run(self):
  23.         # f = open('words_in_articles.txt', 'w')
  24.         doc_count = 0
  25.         data = self.readInput()
  26.         for docid, contents in data:
  27.             text = contents.lower()
  28.             word_count = 0
  29.             for match in self.re.finditer(text):
  30.                 word = match.group(0)
  31.                 if word not in self.results:
  32.                     self.results[word] = {docid: 1}
  33.                 else:
  34.                     if docid not in self.results[word]:
  35.                         self.results[word][docid] = 1
  36.                     else:
  37.                         self.results[word][docid] += 1
  38.                 word_count += 1
  39.             sys.stderr.write("reporter:counter:MyCounters,InputWords,%d\n" % word_count)
  40.             doc_count += 1
  41.             if doc_count % 1000 == 0:
  42.                 self.emitResults()
  43.                 sys.stderr.write("reporter:status:Processed %d documents\n" % doc_count)
  44.             # f.write(str(str(docid) + ' ' + str(word_count) + '\n').encode('utf-8'))
  45.         self.emitResults()
  46.         # f.close()
  47.  
  48.     def readInput(self):
  49.         for line in sys.stdin:
  50.             yield unicode(line, 'utf8').strip().split('\t', 1)
  51.  
  52.     def emitResults(self):
  53.         for word, counts in self.results.iteritems():
  54.             for docid, count in counts.iteritems():
  55.                 print ('%s\t%s\t%s' % (word, docid, count)).encode('utf-8')
  56.         self.results = {}
  57.  
  58.  
  59. class Reducer:
  60.  
  61.     def run(self):
  62.         # f = open('top20words.txt', 'w')
  63.         self.top20words = {}
  64.         data = self.readInput()
  65.         for word, group in groupby(data, itemgetter(0)):
  66.             for word, docid, count in group:
  67.                 # print word, docid, count
  68.                 if word in self.top20words:
  69.                     self.top20words[word] += int(count)
  70.                 else:
  71.                     self.top20words[word] = int(count)
  72.         for pair in list(sorted(self.top20words.items(), key=lambda (k, v): v))[-20:]:
  73.             # f.write(pair[0] + '\n')
  74.             print('%s ' % (pair[0])).encode('utf-8')
  75.  
  76.     def readInput(self):
  77.         for line in sys.stdin:
  78.             yield unicode(line, 'utf8').strip().split('\t', 1)
  79.  
  80.  
  81. if __name__ == "__main__":
  82.     mr_func = sys.argv[1]
  83.     if mr_func == "map":
  84.         lang = sys.argv[2]
  85.         mapper = Mapper(lang)
  86.         mapper.run()
  87.     elif mr_func == "reduce":
  88.         reducer = Reducer()
  89.         reducer.run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement