# -*- coding: utf-8 -*-
#
# top20words
#
#
import sys
import re
from itertools import groupby
from operator import itemgetter


class Mapper:
    """Map step of a word count driven through stdin/stdout.

    Reads ``docid<TAB>text`` lines from stdin and writes
    ``word<TAB>docid<TAB>count`` lines (UTF-8) to stdout. Progress is
    reported on stderr using Hadoop-Streaming-style ``reporter:`` lines.

    Counts are buffered in ``self.results`` (``word -> {docid: count}``)
    and flushed every ``flush_every`` documents, so a (word, docid) pair
    can be emitted more than once if the same docid appears in the input
    on both sides of a flush.
    """

    # The text is lower-cased before matching, so only lower-case ranges
    # are listed.
    re_en = re.compile(u"[a-z]+")
    re_ru = re.compile(u"[а-яё]+")

    # Number of documents processed between flushes of the buffered counts.
    flush_every = 1000

    def __init__(self, lang):
        """Select the word pattern: English for ``"en"``, Russian otherwise."""
        self.re = Mapper.re_en if lang == "en" else Mapper.re_ru
        self.results = {}

    def run(self):
        """Count words per document for all of stdin and emit the counts."""
        doc_count = 0
        for docid, contents in self.readInput():
            word_count = 0
            for match in self.re.finditer(contents.lower()):
                per_doc = self.results.setdefault(match.group(0), {})
                per_doc[docid] = per_doc.get(docid, 0) + 1
                word_count += 1
            sys.stderr.write("reporter:counter:MyCounters,InputWords,%d\n" % word_count)
            doc_count += 1
            if doc_count % self.flush_every == 0:
                # Flush periodically to bound the size of the buffer.
                self.emitResults()
                sys.stderr.write("reporter:status:Processed %d documents\n" % doc_count)
        # Emit whatever is left after the last full batch.
        self.emitResults()

    def readInput(self):
        """Yield ``[docid, text]`` for every well-formed stdin line.

        Lines that do not contain both a docid and a non-empty text after
        stripping (blank lines, a docid with nothing after the tab, lines
        without a tab) are skipped and counted instead of aborting the job.
        """
        for line in sys.stdin:
            # Python 2 yields byte strings from stdin; Python 3 yields text.
            if isinstance(line, bytes):
                line = line.decode('utf8')
            fields = line.strip().split('\t', 1)
            if len(fields) != 2:
                sys.stderr.write("reporter:counter:MyCounters,SkippedLines,1\n")
                continue
            yield fields

    def emitResults(self):
        """Write all buffered counts to stdout and clear the buffer."""
        for word, counts in self.results.items():
            for docid, count in counts.items():
                self._write_line('%s\t%s\t%s' % (word, docid, count))
        self.results = {}

    def _write_line(self, text):
        """Write ``text`` plus a newline to stdout, as UTF-8 on Python 2."""
        line = text + u"\n"
        if str is bytes:
            # Python 2: stdout takes bytes, so encode explicitly.
            line = line.encode('utf-8')
        sys.stdout.write(line)


class Reducer:
    """Reduce step: total the per-document counts and print the top words.

    Reads ``word<TAB>docid<TAB>count`` lines from stdin, sums ``count``
    per word into ``self.top20words`` and writes the 20 words with the
    highest totals to stdout, one per line, in ascending order of total.
    The ranking covers only the lines this process reads from stdin.
    """

    def run(self):
        """Aggregate all of stdin and print the 20 most frequent words.

        Raises:
            ValueError: if a count field is not an integer.
        """
        self.top20words = {}
        for word, group in groupby(self.readInput(), itemgetter(0)):
            total = sum(int(count) for _, _, count in group)
            # Accumulate rather than assign: the same word may come back
            # in a later group when the input is not sorted by word.
            self.top20words[word] = self.top20words.get(word, 0) + total
        # Break ties on the word itself so the output is deterministic.
        ranked = sorted(self.top20words.items(),
                        key=lambda item: (item[1], item[0]))
        for word, _ in ranked[-20:]:
            self._write_line('%s ' % word)

    def readInput(self):
        """Yield ``[word, docid, count]`` for every well-formed stdin line.

        Lines that do not split into exactly three tab-separated fields
        (for example blank lines) are skipped.
        """
        for line in sys.stdin:
            # Python 2 yields byte strings from stdin; Python 3 yields text.
            if isinstance(line, bytes):
                line = line.decode('utf8')
            fields = line.strip().split('\t')
            if len(fields) == 3:
                yield fields

    def _write_line(self, text):
        """Write ``text`` plus a newline to stdout, as UTF-8 on Python 2."""
        line = text + u"\n"
        if str is bytes:
            # Python 2: stdout takes bytes, so encode explicitly.
            line = line.encode('utf-8')
        sys.stdout.write(line)


def main(argv):
    """Run the map or reduce step selected by the command-line arguments.

    ``argv`` has the shape of ``sys.argv``: ``[prog, "map", lang]`` or
    ``[prog, "reduce"]``.

    Returns:
        The process exit status: 0 after running a step, 2 when the
        arguments do not select one (a usage line is written to stderr).
    """
    mr_func = argv[1] if len(argv) > 1 else None
    if mr_func == "map" and len(argv) > 2:
        Mapper(argv[2]).run()
        return 0
    if mr_func == "reduce":
        Reducer().run()
        return 0
    prog = argv[0] if argv else "prog"
    sys.stderr.write("usage: %s map <lang> | %s reduce\n" % (prog, prog))
    return 2


if __name__ == "__main__":
    sys.exit(main(sys.argv))