Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Streaming count of requests per department over a sliding window.
#
# Reads text lines from a socket, keeps those whose 7th whitespace-separated
# field is a path of the form "/department/<name>/...", and writes
# (<name>, count) pairs computed over a sliding window to text files.
#
# NOTE(review): `sc` is not defined here; the script presumably runs inside
# the pyspark shell (or another context that provides a SparkContext as
# `sc`) - confirm before running it with spark-submit.
from operator import add

from pyspark.streaming import StreamingContext

BATCH_SECONDS = 10    # micro-batch interval of the streaming context
WINDOW_SECONDS = 30   # length of the sliding window
SLIDE_SECONDS = 10    # how often the windowed count is recomputed


def _department_of(line):
    """Return the department name found in *line*, or None if absent.

    The 7th whitespace-separated field is split on "/"; the line matches
    when the segment after the leading slash is "department", and the
    following segment is returned as the department name.
    (Presumably that field is the request path of an access-log line -
    verify against the data source.)

    Short or malformed lines yield None instead of raising IndexError, so
    a single bad record cannot fail the whole streaming batch.
    """
    fields = line.split()
    if len(fields) <= 6:
        return None
    segments = fields[6].split("/")
    if len(segments) < 3 or segments[1] != "department":
        return None
    return segments[2]


ssc = StreamingContext(sc, BATCH_SECONDS)
lines = ssc.socketTextStream("gw01.itversity.com", 19999)

departmentData = lines.filter(lambda s: _department_of(s) is not None)
departmentTuples = departmentData.map(lambda s: (_department_of(s), 1))

# PySpark's signature is (func, invFunc, windowDuration, slideDuration);
# unlike the Scala API, the inverse function is the SECOND positional
# argument. Passing None selects the plain (non-incremental) reduction,
# which needs no checkpoint directory. Passing the durations directly after
# the reduce function would bind 30 to invFunc and 10 to windowDuration.
# For a per-batch (non-windowed) count, use reduceByKey(add) instead;
# countByDepartment.pprint() is handy for inspecting output while debugging.
countByDepartment = departmentTuples.reduceByKeyAndWindow(
    add, None, WINDOW_SECONDS, SLIDE_SECONDS
)

countByDepartment.saveAsTextFiles("/user/dgadiraju/streaming_count_by_department")

# start() returns immediately. When this is run as a standalone application
# rather than from an interactive shell, follow it with
# ssc.awaitTermination() so the driver stays alive.
ssc.start()
Add Comment
Please, Sign In to add comment