Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python
import sys
from itertools import groupby
from operator import itemgetter
def read_mapper_output(file, separator='\t'):
    """Lazily split each mapper output line into its key and value.

    Args:
        file: Iterable of text lines (``main`` passes ``sys.stdin``).
            The name shadows the Python 2 builtin but is kept so that
            existing keyword callers continue to work.
        separator: String placed between the key and the value.

    Yields:
        A list ``[key, value]`` for each line, split on the first
        occurrence of ``separator``. A line that does not contain the
        separator yields a one-element list ``[key]``.
    """
    for line in file:
        # Split before stripping: stripping the whole line first would
        # also remove a trailing separator, so a record with an empty
        # value would lose its second field.
        fields = line.split(separator, 1)
        # Drop the line terminator and any other trailing whitespace.
        fields[-1] = fields[-1].rstrip()
        yield fields
def main(separator='\t'):
    """Print, for every URI read from STDIN, its number of distinct properties.

    Input lines have the form ``<uri><separator><property>``. Lines that
    share a URI must be adjacent, because ``groupby`` only merges
    consecutive records with equal keys.

    Args:
        separator: String between the URI and the property on input, and
            between the URI and the count on output.
    """
    # Input comes from STDIN
    records = read_mapper_output(sys.stdin, separator=separator)
    # groupby yields each run of consecutive records sharing a URI:
    #   current_uri - string containing the URI (the key)
    #   group       - iterator over that URI's [uri, property] records
    for current_uri, group in groupby(records, itemgetter(0)):
        # A set keeps each property once. Records without a property field
        # (no separator on the line) are skipped one by one, so a single
        # malformed line no longer discards the whole URI.
        properties = {record[1] for record in group if len(record) == 2}
        if not properties:
            # Every record for this URI was malformed; emit nothing.
            continue
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print("%s%s%d" % (current_uri, separator, len(properties)))
if __name__ == "__main__":
    # Run the reducer only when executed as a script, not when imported.
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement