Advertisement
Guest User

Untitled

a guest
Apr 3rd, 2014
62
0
Never
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
Scala 0.72 KB | None | 0 0
  1. import org.apache.spark.rdd.RDD
  2.  
  3. val rsvps: RDD[String] = sc.textFile("hdfs://hadoop.int.meetup.com:9000/facts/all_member_rsvp_dump/data")
  4. val pairs: RDD[(Int, Int)] = rsvps.map(line => (line.split("\t").toList.head.toInt, 1))
  5. val counts: RDD[(Int, Int)] = pairs.reduceByKey((count1, count2) => count1 + count2).cache()
  6.  
  7. val formatted: RDD[String] = counts.map { case (member_id, count) => member_id + "\t" + count }
  8. formatted.saveAsTextFile("hdfs://hadoop.int.meetup.com:9000/generated/member_rsvp_counts")
  9.  
  10. val histogram: RDD[(Int, Int)] = counts.
  11.     map { case (_, count) => (count, 1) }.
  12.     reduceByKey((c1, c2) => c1 + c2)
  13.  
  14. histogram.saveAsTextFile("hdfs://hadoop.int.meetup.com:9000/generated/member_rsvp_histogram")
Advertisement
Add Comment
Please sign in to add a comment
Advertisement