Advertisement
desdemona

reducer wc

May 1st, 2016
477
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.00 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. from operator import itemgetter
  4. import sys
  5.  
  6. # maps words to their counts
  7. word2count = {}
  8.  
  9. # input comes from STDIN
  10. for line in sys.stdin:
  11.     # remove leading and trailing whitespace
  12.     line = line.strip()
  13.  
  14.     # parse the input we got from mapper.py
  15.     word, count = line.split('\t', 1)
  16.     # convert count (currently a string) to int
  17.     try:
  18.         count = int(count)
  19.         word2count[word] = word2count.get(word, 0) + count
  20.     except ValueError:
  21.         # count was not a number, so silently
  22.         # ignore/discard this line
  23.         pass
  24.  
  25. # sort the words lexigraphically;
  26. #
  27. # this step is NOT required, we just do it so that our
  28. # final output will look more like the official Hadoop
  29. # word count examples
  30. #sort by value, not key because I need it that way
  31. sorted_word2count = sorted(word2count.items(), key=itemgetter(1), reverse=True)
  32.  
  33. # write the results to STDOUT (standard output)
  34. for word, count in sorted_word2count:
  35.     print '%s\t%s'% (word, count)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement