Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
# PySpark log-analysis snippet (interactive-session style).
# Counts lines mentioning various users in a text log, then extracts the
# timestamp field from every line mentioning 'bob'.
# Assumes an active SparkSession bound to `spark` (e.g. the pyspark shell)
# and that log lines are tab-separated with the timestamp at index 2 —
# TODO confirm log schema against the producer of small.log.

textFile = spark.read.text('small.log')  # DataFrame with a single 'value' string column
textFile.count()
textFile.filter(textFile.value.contains('bob')).count()
textFile.filter(textFile.value.contains('alice')).count()
textFile.filter(textFile.value.contains('alice2')).count()

bob_rows = textFile.filter(textFile.value.contains('bob'))
# BUG FIX: the original re-wrapped each Row in another Row via
# `flatMap(lambda x: Row(x))` and then called `.rdd` on the resulting RDD —
# RDDs have no `.rdd` attribute, so that line raised AttributeError; and
# `row.split(...)` would have failed anyway because Row objects have no
# `split` method. Map directly over the DataFrame's RDD and split the
# line text held in each Row's `value` field.
bob_times = bob_rows.rdd.map(lambda row: row.value.split('\t')[2])
bob_times
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement