Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- #
- #
- import sys
- import re
- from itertools import groupby
- from operator import itemgetter
class Mapper(object):
    """Map step: count word occurrences per document.

    Reads ``docid<TAB>text`` lines (UTF-8) from standard input and writes
    ``word<TAB>docid<TAB>count`` lines (UTF-8) to standard output.  Progress
    is reported on standard error as ``reporter:counter:`` and
    ``reporter:status:`` lines (presumably the Hadoop Streaming reporter
    convention -- confirm against the job setup).

    The code runs unchanged on Python 2 and Python 3.
    """

    # A word is a maximal run of lowercase letters of the chosen alphabet;
    # the text is lowercased before matching, so uppercase input is covered.
    re_en = re.compile(u"[a-z]+")
    re_ru = re.compile(u"[а-яё]+")

    # Buffered counts are written out and dropped after this many documents,
    # which bounds the size of ``self.results``.
    FLUSH_EVERY = 1000

    def __init__(self, lang):
        """Select the word pattern for *lang*.

        Args:
            lang: ``"en"`` selects the Latin pattern; any other value
                selects the Russian pattern.
        """
        self.re = Mapper.re_en if lang == "en" else Mapper.re_ru
        # word -> {docid: occurrences}; emptied by every emitResults() call.
        self.results = {}

    def run(self):
        """Consume all input documents and emit their per-document counts."""
        doc_count = 0
        for docid, contents in self.readInput():
            word_count = 0
            for match in self.re.finditer(contents.lower()):
                per_doc = self.results.setdefault(match.group(0), {})
                per_doc[docid] = per_doc.get(docid, 0) + 1
                word_count += 1
            sys.stderr.write("reporter:counter:MyCounters,InputWords,%d\n" % word_count)
            doc_count += 1
            if doc_count % self.FLUSH_EVERY == 0:
                self.emitResults()
                sys.stderr.write("reporter:status:Processed %d documents\n" % doc_count)
        # Flush whatever is left after the last full batch.
        self.emitResults()

    def readInput(self):
        """Yield ``[docid, contents]`` pairs parsed from standard input.

        Blank lines are skipped.  A line without a tab (for example a
        document whose text is empty, once surrounding whitespace has been
        stripped) yields the whole line as ``docid`` with empty ``contents``
        instead of a one-element list that the caller cannot unpack.
        """
        # On Python 3 read the underlying byte stream so decoding is always
        # UTF-8 regardless of locale; Python 2 file objects already yield bytes.
        stream = getattr(sys.stdin, "buffer", sys.stdin)
        for raw in stream:
            if isinstance(raw, bytes):
                raw = raw.decode("utf8")
            line = raw.strip()
            if not line:
                continue
            docid, _, contents = line.partition(u"\t")
            yield [docid, contents]

    def emitResults(self):
        """Write all buffered counts to standard output and clear the buffer.

        Output is written as UTF-8 bytes, so ``sys.stdout`` must either
        expose a binary ``buffer`` (Python 3 text streams) or accept bytes
        itself (Python 2 files, ``io.BytesIO``).
        """
        out = getattr(sys.stdout, "buffer", sys.stdout)
        for word, counts in self.results.items():
            for docid, count in counts.items():
                record = u"%s\t%s\t%s\n" % (word, docid, count)
                out.write(record.encode("utf-8"))
        self.results = {}
class Reducer(object):
    """Reduce step: total the counts per word and print the top 20 words.

    Reads ``word<TAB>docid<TAB>count`` lines (UTF-8) from standard input,
    the format written by the mapper in this file.  The code runs unchanged
    on Python 2 and Python 3.
    """

    def run(self):
        """Aggregate all input and print the 20 most frequent words.

        Totals are kept in ``self.top20words`` (word -> summed count over all
        documents).  The selected words are written one per line, each
        followed by a single space, ordered from the least to the most
        frequent of the selection.  Fewer than 20 distinct words simply
        produce fewer lines.

        Raises:
            ValueError: if a count field is not an integer or a line is
                malformed (see ``readInput``).
        """
        self.top20words = {}
        totals = self.top20words
        for word, _docid, count in self.readInput():
            totals[word] = totals.get(word, 0) + int(count)

        # Ascending sort, then keep the tail: the highest totals come last.
        ranked = sorted(totals.items(), key=itemgetter(1))
        out = getattr(sys.stdout, "buffer", sys.stdout)
        for word, _total in ranked[-20:]:
            out.write((u"%s \n" % word).encode("utf-8"))

    def readInput(self):
        """Yield ``[word, docid, count]`` field lists from standard input.

        Blank lines are skipped.

        Raises:
            ValueError: if a non-blank line does not have exactly three
                tab-separated fields.  The input is produced by the mapper,
                so a malformed line points at a pipeline problem and is not
                silently ignored.
        """
        # On Python 3 read the underlying byte stream so decoding is always
        # UTF-8 regardless of locale; Python 2 file objects already yield bytes.
        stream = getattr(sys.stdin, "buffer", sys.stdin)
        for raw in stream:
            if isinstance(raw, bytes):
                raw = raw.decode("utf8")
            line = raw.strip()
            if not line:
                continue
            fields = line.split(u"\t")
            if len(fields) != 3:
                raise ValueError("malformed reducer input line: %r" % line)
            yield fields
def main(argv=None):
    """Run the map or reduce step selected on the command line.

    Usage: ``<script> map <lang>`` or ``<script> reduce``.  Extra trailing
    arguments are ignored.

    Args:
        argv: argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 after the selected step has run, 2 when the arguments do not name
        a valid step (a usage message is written to standard error instead
        of failing with IndexError or silently doing nothing).
    """
    args = sys.argv if argv is None else argv
    mode = args[1] if len(args) > 1 else None

    if mode == "map" and len(args) > 2:
        Mapper(args[2]).run()
        return 0
    if mode == "reduce":
        Reducer().run()
        return 0

    prog = args[0] if args else "script"
    sys.stderr.write("usage: %s map <lang> | %s reduce\n" % (prog, prog))
    return 2


if __name__ == "__main__":
    sys.exit(main())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement