Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package adapters
// This is an example of a multiclass classifier using Naive Bayes.
// DATASET SOURCE: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html
// ===========================================================================================================
// It also demonstrates how to save an already-trained model:
// TO SAVE:
// modelo.save("modelo")
// TO LOAD (via the Scala REPL (Read-Evaluate-Print Loop) console):
// import org.apache.spark.ml.PipelineModel
// val modelo_treinado = PipelineModel.load("modelo")
// TO USE:
// val data2 = spark.read.format("libsvm").option("header", "false").option("inferSchema", "true").load("news20.full")
// modelo_treinado.transform(data2)
//
// Example: loading the Naive Bayes model already saved in the previous step
- import org.apache.spark.sql.SparkSession
- import org.apache.spark.ml.Pipeline
- import org.apache.spark.ml.classification.NaiveBayes
- import org.apache.spark.ml.classification.LogisticRegression
- import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
- import org.apache.log4j._
- import org.apache.spark.sql._
- import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer, StopWordsRemover}
- import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}
- import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
- import org.apache.spark.mllib.evaluation.MulticlassMetrics
- import org.apache.spark.sql.functions._
object SparkML4_Testing {

  /** Evaluates a sentiment-analysis method against a labelled test set and
    * prints multiclass metrics to stdout: confusion matrix, accuracy,
    * per-label precision / recall / false-positive rate / F1, and the
    * weighted version of each.
    *
    * Two families of methods are supported (toggled by (un)commenting):
    *  - Spark ML pipelines (Naive Bayes / Logistic Regression / Random Forest
    *    over TF-IDF features) — currently disabled;
    *  - iFeel lexicon adapters (Afinn, Emolex, ..., SentiWordNet) — currently
    *    SentiWordNet is the active method.
    */
  def main(args: Array[String]): Unit = {
    // Silence all framework logging so stdout carries only the metric values.
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)
    Logger.getLogger("org.apache.spark.SparkContext").setLevel(Level.OFF)

    // Local Spark session using every available core.
    val spark = SparkSession.builder()
      .appName("SparkML4_Testing")
      .master("local[*]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("OFF")

    // Training and test data locations (tab-separated: sentence \t label).
    val entrada = "data/IMDB/trainning1.csv"
    val test = "data/IMDB/test1.csv"

    /*val trainData = (spark.read.format("csv") // disabled while testing the iFeel methods
      .option("header","false")
      .option("delimiter","\t")
      .option("inferSchema","true")
      .load(entrada).toDF("sentence","label"))*/
    val testData = spark.read.format("csv")
      .option("header", "false")
      .option("delimiter", "\t")
      .option("inferSchema", "true")
      .load(test)
      .toDF("sentence", "label")

    /*
    // CONVERTING TEXT TO THE LIBSVM FEATURE FORMAT ===================================
    // Tokenizer
    val tokenizer = new Tokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
    // Stop-word remover
    val remover = new StopWordsRemover()
      .setInputCol("words")
      .setOutputCol("filtered")
    // TF-IDF configuration
    val numFeatures = 5000
    //val minDocFreq = 5 //NB
    val minDocFreq = 1 //RF
    // TF
    val hashingTF = new HashingTF()
      .setInputCol("filtered")
      .setOutputCol("tf")
      .setNumFeatures(numFeatures)
    // IDF
    val idf = new IDF()
      .setInputCol("tf")
      .setOutputCol("features")
      .setMinDocFreq(minDocFreq)
    // Classifier instance
    //val modelo = new NaiveBayes().setSmoothing(0.2)
    //val modelo = new LogisticRegression()
    val modelo = new RandomForestClassifier().setNumTrees(5).setMaxBins(27).setMaxDepth(15)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, remover, hashingTF, idf, modelo))
    val trainned_model = pipeline.fit(trainData)
    // run the sentiment classification
    val predicoes = trainned_model.transform(testData)
    //predicoes.show()
    //predicoes.printSchema()
    */

    // iFEEL METHOD TESTS =========================================================
    import spark.implicits._

    // Enable exactly ONE of the adapter variants below. Each variant registers
    // a UDF wrapping the corresponding iFeel method; the single shared line
    // after them appends the method's polarity score as column "analise".
    // (This replaces the previous `var resultado: Option[DataFrame]` + `.get`
    // pattern with a plain immutable val.)
    /*import adapters.Afinn
    val metodo = spark.udf.register("Metodo", (input: String) => { Afinn.as(input) })*/
    /*import adapters.Emolex
    val metodo = spark.udf.register("Metodo", (input: String) => { Emolex.as(input) })*/
    /*import adapters.Emoticons
    val metodo = spark.udf.register("Metodo", (input: String) => { Emoticons.as(input) })*/
    /*import adapters.EmoticonDS
    val metodo = spark.udf.register("Metodo", (input: String) => { EmoticonDS.as(input) })*/
    /*import adapters.HappinessIndex
    val metodo = spark.udf.register("Metodo", (input: String) => { HappinessIndex.as(input) })*/
    /*import adapters.MPQA
    val metodo = spark.udf.register("Metodo", (input: String) => { MPQA.as(input) })*/
    /*import adapters.NRC
    val metodo = spark.udf.register("Metodo", (input: String) => { NRC.as(input) })*/
    /*import adapters.Opinion
    val metodo = spark.udf.register("Metodo", (input: String) => { Opinion.as(input) })*/
    /*import adapters.PanasT
    val metodo = spark.udf.register("Metodo", (input: String) => { PanasT.as(input) })*/
    /*import adapters.Sann
    val metodo = spark.udf.register("Metodo", (input: String) => { Sann.as(input) })*/
    /*import adapters.Sasa
    val metodo = spark.udf.register("Metodo", (input: String) => { Sasa.as(input) })*/
    /*import adapters.SenticNet
    val metodo = spark.udf.register("Metodo", (input: String) => { SenticNet.as(input) })*/
    /*import adapters.Sentiment140
    val metodo = spark.udf.register("Metodo", (input: String) => { Sentiment140.as(input) })*/
    /*import adapters.SentiStrength
    val metodo = spark.udf.register("Metodo", (input: String) => { SentiStrength.as(input) })*/
    import adapters.SentiWordNet
    val metodo = spark.udf.register("Metodo", (input: String) => { SentiWordNet.as(input) })
    val resultado = testData.withColumn("analise", metodo(testData.col("sentence")))

    // Map the adapter's {-1, 0, 1} polarity output onto the dataset's label
    // encoding {0.0, 1.0, 2.0}.
    val predictions = resultado.select(
        when($"analise" === -1, 0.0)
          .when($"analise" === 0, 1.0)
          .when($"analise" === 1, 2.0)
          .as("predicao"))
      .rdd.map(_.getDouble(0))
    // NOTE(review): getDouble assumes inferSchema typed "label" as a double;
    // if the CSV holds integer labels this throws at runtime — confirm against
    // data/IMDB/test1.csv.
    val labels = resultado.select("label")
      .rdd.map(_.getDouble(0))

    // METRICS ===============================================================
    //val predictions = resultado.select("prediction").rdd.map(_.getDouble(0)) //Spark implementations
    //val labels = resultado.select("label").rdd.map(_.getDouble(0)) //Spark implementations
    val predictionAndLabels = predictions.zip(labels)
    val metrics = new MulticlassMetrics(predictionAndLabels)

    // Confusion matrix
    println(metrics.confusionMatrix)

    // Overall accuracy
    println(metrics.accuracy)

    val metricLabels = metrics.labels
    // Precision by label
    metricLabels.foreach(l => println(metrics.precision(l)))
    // Recall by label
    metricLabels.foreach(l => println(metrics.recall(l)))
    // False positive rate by label
    metricLabels.foreach(l => println(metrics.falsePositiveRate(l)))
    // F-measure by label
    metricLabels.foreach(l => println(metrics.fMeasure(l)))

    // Weighted statistics
    println(metrics.weightedPrecision)
    println(metrics.weightedRecall)
    println(metrics.weightedFMeasure)
    println(metrics.weightedFalsePositiveRate)

    // possible metrics: f1-score, precision, recall, weightedPrecision and weightedRecall
    /*val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
    val accuracy = evaluator.setMetricName("accuracy").evaluate(predicoes)
    val weightedPrecision = evaluator.setMetricName("weightedPrecision").evaluate(predicoes)
    val weightedRecall = evaluator.setMetricName("weightedRecall").evaluate(predicoes)
    val f1 = evaluator.setMetricName("f1").evaluate(predicoes)
    println()
    println("Test accuracy = " + accuracy)
    println("Test weightedPrecision = " + weightedPrecision)
    println("Test weightedRecall = " + weightedRecall)
    println("Test f1_score = " + f1)*/
    /*import spark.implicits._
    val toDouble = udf[Double, String]( _.toDouble)
    val predictionAndLabels = predicoes.withColumn("prediction", predicoes("prediction"))
      .withColumn("label", predicoes("label"))
      .rdd.map(r => (r.getDouble(0), r.getDouble(1)))*/
    /*val metrics = new MulticlassMetrics(predictionAndLabels)
    println("Confusion Matrix: " + metrics.confusionMatrix)*/

    spark.stop()
  }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement