Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import com.johnsnowlabs.nlp.{DocumentAssembler, Finisher}
- import com.johnsnowlabs.nlp.annotators.{Normalizer, Stemmer, Tokenizer}
- import com.johnsnowlabs.nlp.annotator._
- import com.johnsnowlabs.nlp.base._
- import com.johnsnowlabs.util.Benchmark
- import org.apache.spark.ml.feature.NGram
- import org.apache.spark.ml.Pipeline
- import org.apache.spark.ml.feature.{StopWordsRemover, IDF, HashingTF, CountVectorizer, Word2Vec}
- import org.apache.spark.sql.{Row, SparkSession}
- import org.apache.spark.sql.functions._
// --- Text preprocessing stages -------------------------------------------

// Entry stage: reads the raw text column (its name comes from `textColumnName`,
// defined outside this excerpt) and emits it as the `document` annotation column.
val documentAssembler =
  new DocumentAssembler()
    .setOutputCol("document")
    .setInputCol(textColumnName)

// Derives the `sentence` annotation column from `document`.
val sentenceDetector =
  new SentenceDetector()
    .setOutputCol("sentence")
    .setInputCols(Array("document"))

// Produces the `token` annotation column.
// NOTE(review): this reads `document`, not the `sentence` column produced above —
// confirm that is intended, since the POS tagger consumes both `sentence` and `token`.
val token =
  new Tokenizer()
    .setOutputCol("token")
    .setInputCols(Array("document"))

// Turns `token` into the `normalized` column (later consumed by the stemmer and the finisher).
val normalizer =
  new Normalizer()
    .setOutputCol("normalized")
    .setInputCols(Array("token"))

// Turns `normalized` into the `stem` column.
val stemmer =
  new Stemmer()
    .setOutputCol("stem")
    .setInputCols(Array("normalized"))
// --- Part-of-speech tagger -------------------------------------------------

// Extra reader options forwarded to the POS corpus loader.
val posOptions = Map("format" -> "text")

// Trainable perceptron POS tagger: consumes `sentence` + `token`, emits `pos`.
// Training data is read from the MASC tagged corpus on HDFS as a Spark dataset.
// Presumably `delimiter` is the separator between each word and its tag in the
// corpus files — verify against the corpus format.
val posTagger =
  new PerceptronApproach()
    .setInputCols(Array("sentence", "token"))
    .setOutputCol("pos")
    .setNIterations(5)
    .setCorpus(
      path = "hdfs:///input/nlp/pos-tagger/masc_tagged/data/*",
      delimiter = "_",
      readAs = "SPARK_DATASET",
      options = posOptions
    )
// --- Finisher --------------------------------------------------------------

// Converts the `normalized` annotations into the `tokens_array` output column.
// Output is requested as an array, and annotation cleaning is switched off
// (the recorded schema further down still lists the annotation columns).
val token_finisher =
  new Finisher()
    .setOutputAsArray(true)
    .setCleanAnnotations(false)
    .setInputCols("normalized")
    .setOutputCols("tokens_array")
// Assemble the full pipeline. Stage order matters: each stage reads columns
// written by the stages listed before it.
val pipeline = new Pipeline().setStages(
  Array(
    // raw text -> document -> sentence
    documentAssembler, sentenceDetector,
    // token -> normalized -> stem
    token, normalizer, stemmer,
    // POS tags, then plain-string token output
    posTagger, token_finisher
  )
)
// Fit the whole pipeline on `test` and report how long the fit takes.
// Recorded in the original session: test.count = res20: Long = 52414 (Wikipedia page titles).
// NOTE(review): fitting on `test` and later transforming `training` reads as
// swapped naming — confirm this is intentional.
val model = Benchmark.time("Time to train model")(pipeline.fit(test))
- /*
- Time to train model: 1364.868964391sec
- model: org.apache.spark.ml.PipelineModel = pipeline_8df5ba357611
- */
// Apply the fitted pipeline to `training` and time the call.
// NOTE(review): the ~0.14 s recorded below for a result of ~5.2M rows suggests
// this measures only the construction of a lazy DataFrame, not the annotation
// work itself — confirm before quoting it as prediction time.
val pipeLineDF = Benchmark.time("Time for prediction")(model.transform(training))
- /*
- Time for prediction: 0.136970197sec
- pipeLineDF: org.apache.spark.sql.DataFrame = [id: string, title: string ... 7 more fields]
- */
// Print the schema of the annotated DataFrame (recorded output follows).
pipeLineDF.printSchema()
- /*
- root
- |-- id: string (nullable = true)
- |-- title: string (nullable = true)
- |-- document: array (nullable = true)
- | |-- element: struct (containsNull = true)
- | | |-- annotatorType: string (nullable = true)
- | | |-- begin: integer (nullable = false)
- | | |-- end: integer (nullable = false)
- | | |-- result: string (nullable = true)
- | | |-- metadata: map (nullable = true)
- | | | |-- key: string
- | | | |-- value: string (valueContainsNull = true)
- |-- sentence: array (nullable = true)
- | |-- element: struct (containsNull = true)
- | | |-- annotatorType: string (nullable = true)
- | | |-- begin: integer (nullable = false)
- | | |-- end: integer (nullable = false)
- | | |-- result: string (nullable = true)
- | | |-- metadata: map (nullable = true)
- | | | |-- key: string
- | | | |-- value: string (valueContainsNull = true)
- |-- token: array (nullable = true)
- | |-- element: struct (containsNull = true)
- | | |-- annotatorType: string (nullable = true)
- | | |-- begin: integer (nullable = false)
- | | |-- end: integer (nullable = false)
- | | |-- result: string (nullable = true)
- | | |-- metadata: map (nullable = true)
- | | | |-- key: string
- | | | |-- value: string (valueContainsNull = true)
- |-- normalized: array (nullable = true)
- | |-- element: struct (containsNull = true)
- | | |-- annotatorType: string (nullable = true)
- | | |-- begin: integer (nullable = false)
- | | |-- end: integer (nullable = false)
- | | |-- result: string (nullable = true)
- | | |-- metadata: map (nullable = true)
- | | | |-- key: string
- | | | |-- value: string (valueContainsNull = true)
- |-- stem: array (nullable = true)
- | |-- element: struct (containsNull = true)
- | | |-- annotatorType: string (nullable = true)
- | | |-- begin: integer (nullable = false)
- | | |-- end: integer (nullable = false)
- | | |-- result: string (nullable = true)
- | | |-- metadata: map (nullable = true)
- | | | |-- key: string
- | | | |-- value: string (valueContainsNull = true)
- |-- pos: array (nullable = true)
- | |-- element: struct (containsNull = true)
- | | |-- annotatorType: string (nullable = true)
- | | |-- begin: integer (nullable = false)
- | | |-- end: integer (nullable = false)
- | | |-- result: string (nullable = true)
- | | |-- metadata: map (nullable = true)
- | | | |-- key: string
- | | | |-- value: string (valueContainsNull = true)
- |-- tokens_array: array (nullable = true)
- | |-- element: string (containsNull = true)
- */
// Count the rows of the annotated DataFrame.
// Recorded result: res58: Long = 5208697
pipeLineDF.count()
// Show each title next to its predicted POS tags: first 100 rows, untruncated.
pipeLineDF
  .select("title", "pos.result")
  .show(numRows = 100, truncate = false)
- /*
- +-------------------------------------------------------------------+--------------------------------------+
- |title |result |
- +-------------------------------------------------------------------+--------------------------------------+
- |The Sneetches and Other Stories |[DT, NNP, CC, JJ, NNS] |
- |Seocho-dong |[NNP] |
- |Pelargonium sidoides |[NNP, NNS] |
- |El Gran Combo de Puerto Rico |[NNP, NNP, NNP, IN, NNP, NNP] |
- |Love Is the Plan the Plan Is Death |[NN, VBZ, DT, NNP, DT, NNP, VBZ, NN] |
- |Pratima Kumari |[NNP, NNP] |
- |List of tropical and subtropical moist broadleaf forests ecoregions|[NN, IN, JJ, CC, JJ, NN, NN, NNS, NNS]|
- |2003–04 Segunda División |[CD, NNP, NNP] |
- |Lifetouch |[NNP] |
- |Metrostar |[NNP] |
- |CANPASS |[NNP] |
- |Fallen Angel (UK TV series) |[NNP, NNP, (, NNP, NNP, NN, )] |
- |Kuni-kyō |[NNP] |
- |Barham Salih |[NNP, NNP] |
- |Chokher Bali (film) |[NNP, NNP, (, NN, )] |
- |Durio dulcis |[NNP, NN] |
- |Florentine painting |[NNP, NN] |
- |Zoidogamy |[NNP] |
- |PO postcode area |[NNP, NN, NN] |
- |Eveleigh, New South Wales |[NNP, ,, NNP, NNP, NNP] |
- |Android Nim |[NNP, NNP] |
- |Kyle Dunnigan |[NNP, NNP] |
- |Jawad Bashir |[NNP, NNP] |
- |Continental O-190 |[NNP, NNP] |
- |List of 3D graphics libraries |[NN, IN, CD, NNS, NNS] |
- |British Universities and Colleges Sport |[JJ, NNP, CC, NNP, NNP] |
- |Horns of Hattin |[NNP, IN, NNP] |
- |Systemic risk |[NNP, NN] |
- |Ho Ching |[NNP, NNP] |
- |Blake's Lock |[NNP, POS, NNP] |
- |Vincent Buckley |[NNP, NNP] |
- |Steve Bozek |[NNP, NNP] |
- |The Bird and the Worm |[DT, NNP, CC, DT, NNP] |
- |MP3Gain |[NNP] |
- |Lost City of the Jungle |[NNP, NNP, IN, DT, NNP] |
- |Bible College of Malaysia |[NNP, NNP, IN, NNP] |
- |Grease duct |[NNP, NN] |
- |Air America (TV series) |[NNP, NNP, (, NN, NN, )] |
- |Water Framework Directive |[NNP, NNP, NNP] |
- |Regent Hotel |[NNP, NNP] |
- |One-shot (comics) |[NNP, (, NNS, )] |
- |Before We Were So Rudely Interrupted |[IN, PRP, VBD, RB, RB, NNP] |
- |Lindauer Dornier |[NNP, NNP] |
- |Mariner Software |[NNP, NNP] |
- |The Fisher-Girl and the Crab |[DT, NNP, CC, DT, NNP] |
- |7-orthoplex |[NN] |
- |French military mission to Japan (1872–80) |[JJ, JJ, NN, TO, NNP, (, CD, )] |
- |Hui Liangyu |[NNP, NNP] |
- |Christine Arron |[NNP, NNP] |
- |Moose test |[NNP, NN] |
- |Arrasando (song) |[NNP, (, NN, )] |
- |Daydream (1964 film) |[NNP, (, CD, NN, )] |
- |Anecdote of Men by the Thousands |[NNP, IN, NN, IN, DT, NNS] |
- |Strain (biology) |[NNP, (, NN, )] |
- |Haustrinae |[NNP] |
- |Cirrus Aircraft |[NNP, NNP] |
- |Syracuse High School (Syracuse, Utah) |[NNP, NNP, NNP, (, NNP, ,, NNP, )] |
- |Mezamashi TV |[NNP, NN] |
- |Vermont statistical areas |[NNP, JJ, NNS] |
- |Portugal during World War I |[NNP, IN, NNP, NNP, PRP] |
- |Cycles (The Doobie Brothers album) |[NNP, (, DT, NNP, NNP, NN, )] |
- |Inferior frontal sulcus |[NNP, JJ, NN] |
- |Saskatchewan Highway 41 |[NNP, NNP, CD] |
- |Barony Rosendal |[NNP, NNP] |
- |Mishima ware |[NNP, NN] |
- |Ijon Tichy |[NNP, NNP] |
- |Wilusa |[NNP] |
- |Thomas Dybdahl |[NNP, NNP] |
- |Adam Gardiner |[NNP, NNP] |
- |Fournier RF-9 |[NNP, NNP] |
- |Ola Sundell |[NNP, NNP] |
- |My Barbarian |[PRP$, NN] |
- |2004–05 Iraqi Premier League |[CD, NNP, NNP, NNP] |
- |Jean Wade Rindlaub |[NNP, NNP, NNP] |
- |Miskel Spillman |[NNP, NNP] |
- |Bonytail chub |[NNP, NN] |
- |Japanese Journal of Applied Physics |[JJ, NNP, IN, NNP, NNP] |
- |Disembowelment (band) |[NNP, (, NN, )] |
- |Brethren of the Coast |[NNP, IN, DT, NNP] |
- |Fly (exercise) |[NNP, (, NN, )] |
- |Mathilde Krim |[NNP, NNP] |
- |Usman Tariq |[NNP, NNP] |
- |Christopher Plunkett, 1st Baron of Dunsany |[NNP, NNP, ,, CD, NNP, IN, NNP] |
- |Wanna Get to Know You |[NNP, VB, TO, VB, PRP] |
- |Yaxuna |[NNP] |
- |Glass (Index Case album) |[NN, (, NNP, NN, NN, )] |
- |Christy Hemme |[NNP, NNP] |
- |Zod |[NNP] |
- |River City High |[NNP, NNP, NNP] |
- |William Fleming High School |[NNP, NNP, NNP, NNP] |
- |Wee Waa |[NNP, NNP] |
- |I3 |[NNP] |
- |Stephen V. Cole |[NNP, NNP, NNP] |
- |Royal Australian Army Nursing Corps |[NNP, JJ, NNP, NNP, NNP] |
- |The Courier |[DT, NNP] |
- |Olof Johansson |[NNP, NNP] |
- |Solicitor General of the United States |[NNP, NNP, IN, DT, NNP, NNPS] |
- |MWR |[NNP] |
- |Michael Boyer |[NNP, NNP] |
- |Common Fund for Commodities |[NNP, NNP, IN, NNP] |
- +-------------------------------------------------------------------+--------------------------------------+
- only showing top 100 rows
- */
Add Comment
Please, Sign In to add comment