Advertisement
Guest User

mapper1.py

a guest
Mar 14th, 2015
218
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.09 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. import re
  4. import sys
  5.  
  6. # Match the first <item> of the line
  7. subjectRegEx = re.compile("^<([^>]+)>")
  8. # Match the second <item> of the line
  9. propertyRegEx = re.compile("^<[^>]+>\s<([^>]+)>")
  10.  
  11. def read_input(file):
  12.     for line in file:
  13.         # Remove leading and trailing whitespace
  14.         line = line.strip()
  15.         # Split the line according to a regex
  16.         # capture the content of the first <item>
  17.         subject = subjectRegEx.match(line)
  18.         if(subject != None):
  19.             # capture the content of the second <item>
  20.             propertyValue = propertyRegEx.match(line)
  21.             prpt = propertyValue.group(1) if (propertyValue != None) else None
  22.             yield (subject.group(1), prpt)
  23.         else:
  24.             yield (None, None)
  25.  
  26. def main(separator='\t'):
  27.     # Input comes from STDIN
  28.     data = read_input(sys.stdin)
  29.     for uri in data:
  30.         # Checks if there is a match
  31.         if (uri != None):
  32.             # write the result to STDOUT
  33.             # what we output here will be the input for the
  34.             # Reduce step, i.e. the input for reducer.py
  35.             #
  36.             # tab-delimited;
  37.             print '%s%s%s' % (uri[0], separator, uri[1])
  38.  
  39. if __name__ == "__main__":
  40.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement