# Untitled: streaming count of log requests per department (PySpark Streaming)

from pyspark.streaming import StreamingContext
# Streaming job: count requests per department over a sliding window.
#
# Reads text lines from a socket, keeps those whose 7th whitespace-separated
# field is a path of the form "/department/<name>/...", and writes the number
# of such requests per <name> seen during the last 30 seconds, refreshed
# every 10 seconds.
#
# NOTE(review): `sc` is not defined in this file; it is assumed to be the
# SparkContext provided by the pyspark shell/notebook -- confirm.


def _department_of(line):
    """Return the department name found in a log line, or None.

    The 7th whitespace-separated field is split on "/"; the line qualifies
    when the segment at index 1 is "department" and a segment at index 2
    exists, in which case that segment is returned.  Short or malformed
    lines yield None instead of raising IndexError, so a single bad record
    cannot fail the whole batch.
    """
    fields = line.split()
    if len(fields) <= 6:
        return None
    segments = fields[6].split("/")
    if len(segments) > 2 and segments[1] == "department":
        return segments[2]
    return None


# Batch interval of 10 seconds.
ssc = StreamingContext(sc, 10)
lines = ssc.socketTextStream("gw01.itversity.com", 19999)

departmentData = lines.filter(lambda s: _department_of(s) is not None)
departmentTuples = departmentData.map(lambda s: (_department_of(s), 1))

# PySpark's signature is reduceByKeyAndWindow(func, invFunc, windowDuration,
# slideDuration, ...).  The inverse function must be passed explicitly (None
# here, so no checkpointing is required); otherwise 30 would be taken as
# invFunc and 10 as the window length.
countByDepartment = departmentTuples.reduceByKeyAndWindow(
    lambda x, y: x + y,
    None,
    windowDuration=30,
    slideDuration=10,
)

# For per-batch counts without a window, use
# departmentTuples.reduceByKey(lambda x, y: x + y) instead; for debugging,
# countByDepartment.pprint() prints the first records of each batch.
countByDepartment.saveAsTextFiles("/user/dgadiraju/streaming_count_by_department")

# NOTE(review): when run non-interactively (e.g. via spark-submit), follow
# this with ssc.awaitTermination() so the driver does not exit immediately.
ssc.start()