Guest User

Untitled

a guest
Dec 6th, 2019
22
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.15 KB | None | 0 0
  1. import org.apache.spark.SparkContext
  2. import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext, SparkSession}
  3. import org.apache.spark.sql.functions.broadcast
  4.  
  5.  
  /**
   * Spark driver program that loads the three `mlboot_*` TSV files and joins
   * the training answers with the raw data file on the `cuid` column.
   *
   * Only the DataFrames are built here; no action is run on them yet.
   */
  object Classification {

    // Input files, resolved relative to the working directory.
    private val DataPath = "./mlboot_data.tsv" // 11 GB
    private val TestPath = "./mlboot_test.tsv" // 6MB
    private val TrainPath = "./mlboot_train_answers.tsv" // 15 MB

    /** Column used as the join key between the answers and the data file. */
    private val KeyColumn = "cuid"

    /** Minimum number of partitions requested when reading the data file. */
    private val DataMinPartitions = 500

    /** Reads a tab-separated file whose first line is a header row. */
    private def readTsvWithHeader(spark: SparkSession, path: String): DataFrame =
      spark.read
        .option("header", "true")
        .option("delimiter", "\t")
        .csv(path)

    /**
     * Builds a local Spark session, loads the input files and joins the
     * training answers with the data file.
     *
     * @param args command-line arguments (unused)
     */
    def main(args: Array[String]): Unit = {
      val spark = SparkSession.builder()
        .appName("Classifier")
        .config("spark.driver.maxResultSize", "11g")
        .config("spark.sql.broadcastTimeout", "36000")
        .config("spark.master", "local")
        .getOrCreate()

      // Stop the session even when loading or join analysis throws.
      try {
        import spark.implicits._

        // The data file is read as lines first so the partition count can be
        // requested explicitly, then parsed as header-less TSV.
        val dataLines = spark.sparkContext.textFile(DataPath, DataMinPartitions).toDS()
        val dataDF = spark.read
          .option("header", "false")
          .option("delimiter", "\t")
          .csv(dataLines)
          // Without a header Spark names the columns _c0, _c1, ...; the join
          // below needs a column called "cuid" on this side or it fails
          // during analysis.
          // NOTE(review): assumes the first column of the data file is the
          // cuid key - confirm against the dataset description.
          .withColumnRenamed("_c0", KeyColumn)

        val testDF = readTsvWithHeader(spark, TestPath)

        // The answers table is small, so hint Spark to broadcast it instead
        // of shuffling the large data file for the join.
        val trainDF = broadcast(readTsvWithHeader(spark, TrainPath))
          .join(dataDF, Seq(KeyColumn), "inner")
      } finally {
        spark.stop()
      }
    }
  }
Add Comment
Please, Sign In to add comment