Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import re
- import sys
# Capture the content of the first <item> at the start of the line
# (the subject).  Raw strings are used so that regex escapes such as \s are
# never interpreted as (invalid) Python string escapes.
subjectRegEx = re.compile(r"^<([^>]+)>")
# Capture the content of the second <item>; it must follow the first one
# after exactly one whitespace character.
propertyRegEx = re.compile(r"^<[^>]+>\s<([^>]+)>")
def read_input(file):
    """Lazily turn each line of *file* into a ``(subject, property)`` pair.

    Each line is stripped of surrounding whitespace and then tested against
    the module-level ``subjectRegEx`` and ``propertyRegEx`` patterns.

    Yields, once per input line:
      * ``(subject, property)`` when both ``<item>`` groups are found;
      * ``(subject, None)`` when only the leading ``<item>`` is found;
      * ``(None, None)`` when the line does not start with an ``<item>``.
    """
    for raw_line in file:
        text = raw_line.strip()

        subject_match = subjectRegEx.match(text)
        if subject_match is None:
            # No leading <item>: still emit a placeholder for this line.
            yield (None, None)
            continue

        # The property pattern is only tried once a subject has been found.
        property_match = propertyRegEx.match(text)
        if property_match is None:
            prop = None
        else:
            prop = property_match.group(1)

        yield (subject_match.group(1), prop)
def main(separator='\t'):
    """Run the map step: read lines from STDIN, write pairs to STDOUT.

    Every record produced by ``read_input(sys.stdin)`` that has a subject is
    written as ``<subject><separator><property>`` on its own line.  This
    output is the input of the Reduce step (reducer.py).

    Args:
        separator: String placed between subject and property
            (tab-delimited by default).

    Notes:
        * ``read_input`` always yields a tuple, so the record itself is
          never ``None``; lines without a match arrive as ``(None, None)``.
          Those are skipped here instead of being emitted as a bogus
          ``None<separator>None`` line.
        * A record with a subject but no property is still emitted; its
          property is formatted by ``%s`` and therefore appears as the
          text ``None``.
    """
    # Input comes from STDIN
    for subject, prop in read_input(sys.stdin):
        # Skip lines on which no subject could be matched
        if subject is None:
            continue
        # Single-argument call form: valid on both Python 2 and Python 3.
        print('%s%s%s' % (subject, separator, prop))
# Script entry point: run the mapper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement