Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Entry point: starts a local Spark context for the experiments kept below.
 *
 * All experiments are currently commented out, so running this only creates the
 * context and shuts it down again.
 *
 * @param args command-line arguments (not used)
 */
def main(args: Array[String]): Unit = {
  val conf = new SparkConf().setAppName("Simple Application").setMaster("local")
  val sc = new SparkContext(conf)
  try {
    // Experiment 1 (disabled): total number of space-separated tokens in the
    // text file "szekspir".
    /*
    val source = sc.textFile("szekspir")
    val counts = source.map(line => line.split(" ").size)
      .reduce((a,b) => a+b)
    println(counts)
    */

    // Experiment 2 (disabled): unfinished draft that builds random (mu, std)
    // pairs for m groups; the data-generation loop was never completed
    // (`val()` is not valid Scala), so this does not compile as written.
    /*val m =1000
    val random = Random
    val groups = Map((for (i <- 0 to m) yield {
      val mu = random.nextDouble()*10-5
      val std = random.nextDouble()*10-5
      (i->(mu,std))}) : _*)
    val n=1000000
    val data = for (i <- 1 to n) yield
    {val g = random.nextInt(m)
      ;
      val()
    }*/

    // Experiment 3 (disabled): draws n Gaussian samples spread over seven groups
    // with fixed (mu, sigma), reduces each group to (count, sum, sum of squares),
    // then derives count / mean / variance per group and over all records.
    /* val groups = Map(
      0 -> (0, 1),
      1 -> (1, 1),
      2 -> (2, 2),
      3 -> (3, 3),
      4 -> (4, 3),
      5 -> (5, 2),
      6 -> (6, 1)
    )
    val n = 1000000
    val random = Random
    val data = sc.parallelize(
      for (i <- 1 to n) yield {
        val g = random.nextInt(7)
        val (mu,sigma) = groups(g)
        (g, mu + sigma * random.nextGaussian())
      }
    )
    println("------------------ DATA : ----------------")
    data.take(20).foreach(println)
    println("---------------- END OF DATA ---------------")
    val records = (data
      .map(t => (t._1, (1, t._2, t._2 * t._2)))
      .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3))
    )
    records.cache()
    records.take(10).foreach(println)
    // Aggregate in format (Group ID, (Sum count this group in all records, Average value, Variance))
    val groups_aggregated_stats = records.map(r => (r._1, (r._2._1, r._2._2 / r._2._1, ((r._2._3 - ((r._2._2 * r._2._2) / r._2._1)) / r._2._1))))
    // Print in new line each group stats
    groups_aggregated_stats.collect().foreach(println)
    // Aggregate results in all groups to single group "all records"
    val all_records_stats = records.reduce((a, b) => (0, (a._2._1 + b._2._1, a._2._2 + b._2._2, a._2._3 + b._2._3)))
    val results = (all_records_stats._2._1, all_records_stats._2._2 / all_records_stats._2._1, ((all_records_stats._2._3 - ((all_records_stats._2._2 * all_records_stats._2._2) / all_records_stats._2._1)) / all_records_stats._2._1))
    // Return result as (Total count, Total average, Total variance)
    println(results)*/
  } finally {
    // Always shut the context down, even if an experiment throws; the original
    // never called stop() on the context it created.
    sc.stop()
  }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement