Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
import org.apache.spark.rdd.RDD

// Raw RSVP dump: one tab-separated record per line; the first field is the member id.
// NOTE(review): relies on an in-scope SparkContext `sc` (spark-shell / defined elsewhere).
val rsvps: RDD[String] = sc.textFile("hdfs://hadoop.int.meetup.com:9000/facts/all_member_rsvp_dump/data")

// Key each RSVP line by its member id with a count of 1.
// NOTE(review): assumes every line has a parseable integer first field — a malformed
// line would fail the task with NumberFormatException; confirm the dump is clean upstream.
val pairs: RDD[(Int, Int)] = rsvps.map(line => (line.split("\t").head.toInt, 1))

// Total RSVPs per member; cached because two downstream actions consume it.
val counts: RDD[(Int, Int)] = pairs.reduceByKey(_ + _).cache()

// Persist per-member counts as "memberId<TAB>count" lines (same format as the dump's key).
val formatted: RDD[String] = counts.map { case (memberId, count) => s"$memberId\t$count" }
formatted.saveAsTextFile("hdfs://hadoop.int.meetup.com:9000/generated/member_rsvp_counts")

// Histogram over the counts: for each RSVP count, how many members have exactly that many.
val histogram: RDD[(Int, Int)] =
  counts
    .map { case (_, count) => (count, 1) }
    .reduceByKey(_ + _)
histogram.saveAsTextFile("hdfs://hadoop.int.meetup.com:9000/generated/member_rsvp_histogram")
Advertisement
Add Comment
Please sign in to add a comment
Advertisement