Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

// Load the tab-separated training file and split every line into its fields.
val rawData = sc.textFile("file:///search/dje/spark-ml/train_noheader.tsv")
val records = rawData.map(line => line.split("\t"))
// Shell inspection only: display the first parsed record.
records.first()

/** Converts one split record into a [[LabeledPoint]].
  *
  * Double quotes are stripped from every field. The last field is parsed as the
  * integer label; fields from index 4 up to (but excluding) the last become the
  * feature vector, with the placeholder "?" mapped to 0.0.
  *
  * @param fields         the raw fields of one input line
  * @param clampNegatives when true, negative feature values are replaced by 0.0
  * @return the labeled feature vector for this record
  */
def toLabeledPoint(fields: Array[String], clampNegatives: Boolean): LabeledPoint = {
  val cleaned = fields.map(_.replaceAll("\"", ""))
  val lastIdx = fields.size - 1
  val label = cleaned(lastIdx).toInt
  val features = cleaned.slice(4, lastIdx).map { raw =>
    val value = if (raw == "?") 0.0 else raw.toDouble
    if (clampNegatives && value < 0) 0.0 else value
  }
  LabeledPoint(label, Vectors.dense(features))
}

// General-purpose data set: feature values are kept as parsed.
val data = records.map(r => toLabeledPoint(r, clampNegatives = false))
data.cache()
val numData = data.count()

// Variant with negative feature values zeroed out
// (presumably for NaiveBayes, which expects non-negative features -- confirm).
val nbData = records.map(r => toLabeledPoint(r, clampNegatives = true))
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo
import org.apache.spark.mllib.tree.impurity.Entropy

// Shared training hyper-parameters.
val numIterations = 10 // iterations for the SGD-based linear models
val maxTreeDepth = 5   // depth limit for the decision tree

// Linear models trained on the unmodified feature set.
val lrModel = LogisticRegressionWithSGD.train(data, numIterations)
val svmModel = SVMWithSGD.train(data, numIterations)

// Naive Bayes is trained on `nbData`, the variant with negative features zeroed.
// The original referenced the undefined name `nbdata` (wrong case), which does not compile.
// NOTE(review): `nbMobel` looks like a typo for `nbModel`; the name is kept so any
// later code that refers to it keeps working.
val nbMobel = NaiveBayes.train(nbData)

// Entropy-based classification tree on the unmodified feature set.
val dtModel = DecisionTree.train(data, Algo.Classification, Entropy, maxTreeDepth)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement