Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Mapper
#!/usr/bin/python
#
# Streaming mapper: reads web-server access-log lines from stdin and, for
# every line that contains a "GET " request, writes one output line holding
# the requested resource. The resource is passed through urlparse so that
# only its path and query components are kept (any scheme/host prefix in the
# request target is dropped).
import sys

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2


def extract_request_key(line):
    """Return the key emitted for one log line, or None if it has no GET.

    The request target is the first space-delimited token that follows the
    first "GET " in the line. The key is that target's URL path immediately
    followed by its query string.
    """
    pieces = line.strip().split("GET ")
    if len(pieces) < 2:
        # No "GET " in this line: nothing to emit.
        return None
    target = pieces[1].split(" ")[0]
    parsed = urlparse(target)
    # NOTE(review): path and query are joined without a "?" separator, exactly
    # as the original script did; confirm whether the query string should be
    # part of the key at all when counting hits per file.
    return parsed.path + parsed.query


def run_mapper():
    """Emit one key per GET request found on stdin."""
    for line in sys.stdin:
        key = extract_request_key(line)
        if key is not None:
            print(key)


if __name__ == "__main__":
    run_mapper()
# Reducer
#!/usr/bin/python
# Write a MapReduce program which will display the number of hits for each different file on the Web site.
#
# This reducer reads one key per line from stdin, counts how many consecutive
# lines carry the same key, and prints the single key with the highest count
# as "key<TAB>count".
#
# Equal keys must arrive on adjacent lines (i.e. the input is sorted/grouped
# by key); the counting below only merges neighbouring lines.
import sys
from itertools import groupby


def most_hit_key(lines):
    """Return (key, count) for the key with the longest run in *lines*.

    Each line is stripped of surrounding whitespace before comparison, so the
    trailing newline never becomes part of the key. Blank lines are ignored.
    On a tie the key that appeared first wins. Returns (None, 0) when there
    are no non-blank lines.
    """
    best_key = None
    best_count = 0
    keys = (line.strip() for line in lines)
    for key, run in groupby(k for k in keys if k):
        count = sum(1 for _ in run)
        # Strict ">" keeps the earliest key when several share the maximum.
        if count > best_count:
            best_key = key
            best_count = count
    return best_key, best_count


def run_reducer():
    """Print the most-hit key and its count; print nothing for empty input."""
    key, count = most_hit_key(sys.stdin)
    if key is not None:
        print("{0}\t{1}".format(key, count))


if __name__ == "__main__":
    run_reducer()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement