Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from mrjob.job import MRJob
- from mrjob.step import MRStep
- from stemming.porter2 import stem
class MRWordCount(MRJob):
    """Two-step job: count stemmed words, then order the words by count.

    Step 1 emits ``(stem, 1)`` for every whitespace-separated token of the
    input and sums those ones per stem.  Step 2 re-keys every
    ``(stem, count)`` pair by a zero-padded count string and finally emits
    ``(stem, padded_count)`` records.
    """

    # Width of the zero-padded count used as the step-2 shuffle key.  The key
    # is a string, so it only orders numerically while every count fits in
    # this many digits; the original 4-digit key mis-ordered counts >= 10000
    # (e.g. "10000" compares lower than "9999").
    _COUNT_SORT_WIDTH = 12

    def steps(self):
        """Return the two map/reduce steps in execution order."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer),
            MRStep(mapper=self.secondmapper, reducer=self.secondreducer),
        ]

    def mapper(self, _, lines):
        """Yield ``(stem, 1)`` for each whitespace-separated token in *lines*.

        Tokens are lower-cased before stemming.  The incoming key is ignored.
        """
        # str.split() with no separator already discards leading/trailing
        # whitespace, so no separate strip() is needed.
        for w in lines.split():
            yield stem(w.lower()), 1

    def reducer(self, key, values):
        """Yield ``(stem, total)`` where total is the sum of *values*."""
        yield key, sum(values)

    def secondmapper(self, key, value):
        """Swap to ``(padded_count, stem)`` so step 2 groups records by count.

        The count is zero-padded to ``_COUNT_SORT_WIDTH`` digits so that
        comparing the keys as text matches comparing the counts as numbers.
        """
        yield '%0*d' % (self._COUNT_SORT_WIDTH, int(value)), key

    def secondreducer(self, key, values):
        """Yield ``(stem, count)`` for every stem that shares the count *key*.

        The count is rendered with the same ``'%04d'`` format the job has
        always emitted, so output records are unchanged; only the internal
        sort key is wider.
        """
        count_text = '%04d' % int(key)
        for v in values:
            yield v, count_text
# Launch the job only when this file is executed as a script, not when the
# module is merely imported.
if __name__ == '__main__':
    MRWordCount.run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement