Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python
"""Word-count reducer: sum the per-word counts read from STDIN.

Each input line is expected to look like ``word<TAB>count`` (the output of
mapper.py).  The totals are written to STDOUT as ``word<TAB>total``, one
per line, ordered by total from highest to lowest.
"""
from operator import itemgetter
import sys


def reduce_counts(lines):
    """Aggregate ``word<TAB>count`` lines into per-word totals.

    Args:
        lines: iterable of text lines.  Leading and trailing whitespace
            (including the newline) is stripped from each line first.

    Returns:
        A list of ``(word, total)`` tuples sorted by total, descending.
        The sort is stable, so words with equal totals stay in the order
        in which the underlying dict yields them.

    Malformed lines are silently discarded rather than aborting the whole
    job: this covers lines with no tab separator (e.g. blank lines) as well
    as lines whose count is not an integer.
    """
    # maps words to their summed counts
    word2count = {}

    for line in lines:
        # split on the first tab only, so the count field is everything
        # after it; partition never raises, unlike tuple-unpacking a split
        word, separator, raw_count = line.strip().partition('\t')
        if not separator:
            # no tab at all: not a "word<TAB>count" record, skip it
            continue

        # convert count (currently a string) to int
        try:
            count = int(raw_count)
        except ValueError:
            # count was not a number, so silently discard this line
            continue

        word2count[word] = word2count.get(word, 0) + count

    # sort by count (value), highest first -- NOT by word
    return sorted(word2count.items(), key=itemgetter(1), reverse=True)


def main():
    """Reduce the records on STDIN and write the totals to STDOUT."""
    # sys.stdout.write produces the same bytes under Python 2 and Python 3,
    # unlike the Python 2-only print statement.
    for word, count in reduce_counts(sys.stdin):
        sys.stdout.write('%s\t%s\n' % (word, count))


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement