aiThanet

DegreesOfSeparationDataset

Feb 2nd, 2021
921
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. package com.sundogsoftware.spark
  2.  
  3. import org.apache.log4j._
  4. import org.apache.spark.sql.SparkSession
  5. import org.apache.spark.sql.functions._
  6. import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
  7.  
  /** Finds the superhero with the most co-appearances.
    *
    * Reads a hero ID -> name table from `data/Marvel-names.txt` and a graph file
    * `data/Marvel-graph.txt` in which every line is treated as a hero ID followed by
    * space-separated entries, each counted as one connection. Connection counts are
    * summed per hero ID (an ID may occur on several lines) and the top hero is printed.
    */
  object MostPopularSuperheroDataset {

    /** One row of Marvel-names.txt: the hero ID and the hero's name. */
    case class SuperHeroNames(id: Int, name: String)

    /** One raw line of Marvel-graph.txt, as produced by `spark.read.text`. */
    case class SuperHero(value: String)

    /** Entry point: runs the job on a local Spark session and prints the result.
      *
      * @param args command-line arguments (unused)
      */
    def main(args: Array[String]): Unit = {

      // Set the log level to only print errors
      Logger.getLogger("org").setLevel(Level.ERROR)

      // Create a SparkSession using every core of the local machine
      val spark = SparkSession
        .builder
        .appName("MostPopularSuperhero")
        .master("local[*]")
        .getOrCreate()

      // try/finally guarantees the session is released even if the job fails.
      try {
        // Explicit schema for Marvel-names.txt (the file has no header row to infer from)
        val superHeroNamesSchema = new StructType()
          .add("id", IntegerType, nullable = true)
          .add("name", StringType, nullable = true)

        // Build up a hero ID -> name Dataset
        import spark.implicits._
        val names = spark.read
          .schema(superHeroNamesSchema)
          .option("sep", " ")
          .csv("data/Marvel-names.txt")
          .as[SuperHeroNames]

        val lines = spark.read
          .text("data/Marvel-graph.txt")
          .as[SuperHero]

        // Spark's split keeps trailing empty strings, so a line ending in a space would
        // yield one phantom token; trimming first keeps the connection count exact.
        val tokens = split(trim(col("value")), " ")

        val connections = lines
          .withColumn("id", tokens(0))
          .withColumn("connections", size(tokens) - 1) // minus the leading hero ID itself
          .groupBy("id").agg(sum("connections").alias("connections"))

        // head(1).headOption instead of first(): an empty input must not throw.
        connections.sort($"connections".desc).head(1).headOption match {
          case None =>
            println("No superhero connections found in data/Marvel-graph.txt.")

          case Some(mostPopular) =>
            val heroId = mostPopular(0)
            val coAppearances = mostPopular(1)

            // The ID may be absent from the names file, or its name may be null.
            val heroName = names
              .filter($"id" === heroId)
              .select("name")
              .head(1)
              .headOption
              .flatMap(row => Option(row.getString(0)))
              .getOrElse(s"Unknown hero (id $heroId)")

            println(s"$heroName is the most popular superhero with $coAppearances co-appearances.")
        }
      } finally {
        spark.stop()
      }
    }
  }
  61.  
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound, or popup ads, we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×