Advertisement
Guest User

reducer1.py

a guest
Mar 14th, 2015
18
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.99 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. from itertools import groupby
  4. from operator import itemgetter
  5. import sys
  6.  
  7. def read_mapper_output(file, separator='\t'):
  8.     for line in file:
  9.         yield line.rstrip().split(separator, 1)
  10.  
  11. def main(separator='\t'):
  12.     # Input comes from STDIN
  13.     data = read_mapper_output(sys.stdin, separator=separator)
  14.     # groupby groups multiple property pairs by uri,
  15.     # and creates an iterator that returns consecutive keys and their group:
  16.     #   current_uri - string containing a uri (the key)
  17.     #   group - iterator yielding all ["<current_uri>", "<prpt>"] items
  18.     for current_uri, group in groupby(data, itemgetter(0)):
  19.         try:
  20.             # Check unique properties using a set (no duplication)
  21.             propertySet = {prpt for current_uri, prpt in group}
  22.             # Output into STDOUT the URI and the number of unique properties
  23.             print "%s%s%d" % (current_uri, separator, len(propertySet))
  24.         except ValueError:
  25.             # count was not a number, so silently discard this item
  26.             pass
  27.  
# Run the reducer only when this file is executed as a script, so that
# importing the module does not start reading from standard input.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement