Dundre32

Untitled

Apr 26th, 2020
435
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. //Pipeline
  2. import org.apache.spark.ml.{Pipeline, PipelineModel}
  3. import org.apache.spark.ml.feature.ChiSqSelectorModel
  4.  
  // Stage 0: turn the string column "category" into the numeric label column "categoryIndex".
  val indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex")

  // Stage 1: tokenize "reviewText" into "words".
  // The pattern matches digits, whitespace, every punctuation character except the
  // apostrophe, apostrophes that are not enclosed by letters on both sides, and the
  // left curly quote. NOTE(review): this relies on RegexTokenizer's default behaviour
  // of treating the pattern as a separator (gaps) - confirm no other setting applies.
  val regexTokenizer = new RegexTokenizer()
    .setPattern("\\p{Digit}|\\p{Space}|[\\p{Punct}&&[^']]|(?<![a-zA-Z])'|'(?![a-zA-Z])|\\“")
    .setInputCol("reviewText")
    .setOutputCol("words")

  // Stage 2: drop stop words from "words", producing "filtered".
  val remover = new StopWordsRemover().setInputCol("words").setOutputCol("filtered")

  // Stage 3: term counts over "filtered", with minDF set to 2.
  // Despite its name, cvModel is the unfitted CountVectorizer; the fitted
  // CountVectorizerModel only exists inside the fitted pipeline.
  val cvModel = new CountVectorizer().setMinDF(2).setInputCol("filtered").setOutputCol("features")

  // Stage 4: IDF weighting of the count vectors ("features" -> "weightedfeatures").
  val idf = new IDF()
    .setInputCol("features")
    .setOutputCol("weightedfeatures")

  // Stage 5: chi-squared feature selection against the indexed category label.
  // .setNumTopFeatures(5) was left commented out by the author as an alternative setting.
  val selector = new ChiSqSelector()
    .setFeaturesCol("weightedfeatures")
    .setLabelCol("categoryIndex")
    .setOutputCol("selectedFeatures")
    .setNumTopFeatures(4000)

  // The position of each stage in this array matters: later code addresses the
  // fitted stages by index.
  val pipeline = new Pipeline().setStages(
    Array(indexer, regexTokenizer, remover, cvModel, idf, selector)
  )

  // Fit all stages on dfNew (defined outside this snippet).
  val model = pipeline.fit(dfNew)
  34.  
  35.  
  // Pull the learned values out of the fitted pipeline. The indices follow the
  // order passed to setStages: stage 3 is the CountVectorizer, stage 5 the ChiSqSelector.
  val vocOfTerms =
    model.stages(3).asInstanceOf[CountVectorizerModel].vocabulary
  // Used further down as positions into vocOfTerms.
  val selectedFeatures =
    model.stages(5).asInstanceOf[ChiSqSelectorModel].selectedFeatures
  39.  
  40.  
  41. // Look up the vocabulary term for each selected feature index, collect the terms in a sorted set, then write them out separated by whitespace.
  42. import scala.collection.mutable.SortedSet
  43. import java.io._
  44.  
  // Collect the vocabulary term behind every selected feature index.
  // The sorted set orders the terms and removes duplicates.
  val sortedSet1: SortedSet[String] = SortedSet.empty[String]
  sortedSet1 ++= selectedFeatures.map(idx => vocOfTerms(idx))

  // Write the terms to a text file, each term followed by a single space
  // (the same output format as before, including the trailing space).
  val file = "output_ds.txt"
  // Encode explicitly as UTF-8 rather than the platform default charset, so that
  // non-ASCII terms are written identically on every machine.
  val writer = new BufferedWriter(
    new OutputStreamWriter(new FileOutputStream(file), java.nio.charset.StandardCharsets.UTF_8)
  )
  try {
    sortedSet1.foreach { term =>
      writer.write(term)
      writer.write(" ")
    }
  } finally {
    // Always release the file handle, even when a write fails part-way through.
    writer.close()
  }
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound, or popup ads, we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×