Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# PySpark log-analysis snippet (interactive-session style).
# Counts lines mentioning various users in a text log, then extracts the
# timestamp field from every line mentioning 'bob'.
# Assumes an active SparkSession bound to `spark` (e.g. the pyspark shell)
# and that log lines are tab-separated with the timestamp at index 2 —
# TODO confirm log schema against the producer of small.log.

textFile = spark.read.text('small.log')  # DataFrame with a single 'value' string column
textFile.count()
textFile.filter(textFile.value.contains('bob')).count()
textFile.filter(textFile.value.contains('alice')).count()
textFile.filter(textFile.value.contains('alice2')).count()

bob_rows = textFile.filter(textFile.value.contains('bob'))
# BUG FIX: the original re-wrapped each Row in another Row via
# `flatMap(lambda x: Row(x))` and then called `.rdd` on the resulting RDD —
# RDDs have no `.rdd` attribute, so that line raised AttributeError; and
# `row.split(...)` would have failed anyway because Row objects have no
# `split` method. Map directly over the DataFrame's RDD and split the
# line text held in each Row's `value` field.
bob_times = bob_rows.rdd.map(lambda row: row.value.split('\t')[2])
bob_times
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement