Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- import org.apache.spark.SparkContext
- import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext, SparkSession}
- import org.apache.spark.sql.functions.broadcast
object Classification {

  /** Entry point: reads the raw MLBoot feature dump plus the test/train
    * answer files, and inner-joins the train answers with the feature data
    * on the shared `cuid` column.
    *
    * NOTE(review): `testDF` and the joined `trainDF` are built but never
    * used before `spark.stop()` — presumably the model training/scoring
    * steps are still to come.
    */
  def main(args: Array[String]): Unit = {
    val dataPath  = "./mlboot_data.tsv"          // raw features, ~11 GB
    val testPath  = "./mlboot_test.tsv"          // ~6 MB
    val trainPath = "./mlboot_train_answers.tsv" // ~15 MB

    val spark = SparkSession.builder().appName("Classifier")
      .config("spark.driver.maxResultSize", "11g")
      .config("spark.sql.broadcastTimeout", "36000")
      .config("spark.master", "local")
      .getOrCreate()
    import spark.implicits._

    // The feature file is a headerless TSV, so Spark auto-names its columns
    // _c0, _c1, ... . Rename the first column to "cuid" so the join below can
    // resolve it — the original join on Seq("cuid") would have thrown an
    // AnalysisException because dataDF had no column of that name.
    // (Assumes the first TSV column is the cuid — TODO confirm against the
    // data dictionary.)
    // textFile(dataPath, 500) forces ~500 input partitions for the 11 GB file.
    val dataDF = spark.read
      .option("header", "false")
      .option("delimiter", "\t")
      .csv(spark.sparkContext.textFile(dataPath, 500).toDS())
      .withColumnRenamed("_c0", "cuid")

    // Test ids (with header). Currently unused — kept for the scoring step.
    val testDF = spark.read.format("csv")
      .option("header", "true")
      .option("delimiter", "\t")
      .load(testPath)

    // Train answers (with header), enriched with the raw features via an
    // inner join on cuid.
    val trainDF = spark.read.format("csv")
      .option("header", "true")
      .option("delimiter", "\t")
      .load(trainPath)
      .join(dataDF, Seq("cuid"), "inner")

    spark.stop()
  }
}
Add Comment
Please sign in to add a comment.