Advertisement
aiThanet

DegreesOfSeparationDataset

Feb 2nd, 2021
2,502
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Scala 1.83 KB | None | 0 0
  1. package com.sundogsoftware.spark
  2.  
  3. import org.apache.log4j._
  4. import org.apache.spark.sql.SparkSession
  5. import org.apache.spark.sql.functions._
  6. import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
  7.  
  8. /** Find the superhero with the most co-appearances. */
  9. object MostPopularSuperheroDataset {
  10.  
  11.   case class SuperHeroNames(id: Int, name: String)
  12.   case class SuperHero(value: String)
  13.  
  14.   /** Our main function where the action happens */
  15.   def main(args: Array[String]) {
  16.    
  17.     // Set the log level to only print errors
  18.     Logger.getLogger("org").setLevel(Level.ERROR)
  19.  
  20.     // Create a SparkSession using every core of the local machine
  21.     val spark = SparkSession
  22.       .builder
  23.       .appName("MostPopularSuperhero")
  24.       .master("local[*]")
  25.       .getOrCreate()
  26.  
  27.     // Create schema when reading Marvel-names.txt
  28.     val superHeroNamesSchema = new StructType()
  29.       .add("id", IntegerType, nullable = true)
  30.       .add("name", StringType, nullable = true)
  31.  
  32.     // Build up a hero ID -> name Dataset
  33.     import spark.implicits._
  34.     val names = spark.read
  35.       .schema(superHeroNamesSchema)
  36.       .option("sep", " ")
  37.       .csv("data/Marvel-names.txt")
  38.       .as[SuperHeroNames]
  39.  
  40.     val lines = spark.read
  41.       .text("data/Marvel-graph.txt")
  42.       .as[SuperHero]
  43.  
  44.     val connections = lines
  45.       .withColumn("id", split(col("value"), " ")(0))
  46.       .withColumn("connections", size(split(col("value"), " ")) - 1)
  47.       .groupBy("id").agg(sum("connections").alias("connections"))
  48.  
  49.     val mostPopular = connections
  50.         .sort($"connections".desc)
  51.         .first()
  52.  
  53.     val mostPopularName = names
  54.       .filter($"id" === mostPopular(0))
  55.       .select("name")
  56.       .first()
  57.  
  58.     println(s"${mostPopularName(0)} is the most popular superhero with ${mostPopular(1)} co-appearances.")
  59.   }
  60. }
  61.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement