Guest User

Untitled

a guest
May 25th, 2018
117
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 15.96 KB | None | 0 0
  1. import com.johnsnowlabs.nlp.{DocumentAssembler, Finisher}
  2. import com.johnsnowlabs.nlp.annotators.{Normalizer, Stemmer, Tokenizer}
  3. import com.johnsnowlabs.nlp.annotator._
  4. import com.johnsnowlabs.nlp.base._
  5. import com.johnsnowlabs.util.Benchmark
  6. import org.apache.spark.ml.feature.NGram
  7.  
  8. import org.apache.spark.ml.Pipeline
  9. import org.apache.spark.ml.feature.{StopWordsRemover, IDF, HashingTF, CountVectorizer, Word2Vec}
  10.  
  11. import org.apache.spark.sql.{Row, SparkSession}
  12. import org.apache.spark.sql.functions._
  13.  
  // ---------------------------------------------------------------------------
  // Annotator stages. Each stage reads the column(s) named in its input setter
  // and writes the column named in its output setter.
  // ---------------------------------------------------------------------------

  // Entry stage: reads the column named by `textColumnName` (defined outside
  // this excerpt; presumably "title", judging by the schema printed further
  // down -- confirm) and writes "document".
  val documentAssembler =
    new DocumentAssembler()
      .setInputCol(textColumnName)
      .setOutputCol("document")

  // "document" -> "sentence".
  val sentenceDetector =
    new SentenceDetector()
      .setInputCols(Array("document"))
      .setOutputCol("sentence")

  // "document" -> "token".
  // NOTE(review): this tokenizes "document" rather than "sentence", although
  // the POS tagger below consumes both "sentence" and "token" -- confirm that
  // this wiring is intended.
  val token =
    new Tokenizer()
      .setInputCols(Array("document"))
      .setOutputCol("token")

  // "token" -> "normalized".
  val normalizer =
    new Normalizer()
      .setInputCols(Array("token"))
      .setOutputCol("normalized")

  // "normalized" -> "stem". No later stage in this pipeline reads "stem";
  // it only shows up as a column of the transformed DataFrame.
  val stemmer =
    new Stemmer()
      .setInputCols(Array("normalized"))
      .setOutputCol("stem")

  // Extra reader options passed along with the POS training corpus.
  val posOptions = Map("format" -> "text")

  // Trainable POS tagger: ("sentence", "token") -> "pos", 5 iterations.
  // The training corpus is read from HDFS; the "_" delimiter presumably
  // separates each word from its tag in the corpus files -- TODO confirm
  // against the corpus layout.
  val posTagger =
    new PerceptronApproach()
      .setNIterations(5)
      .setInputCols(Array("sentence", "token"))
      .setOutputCol("pos")
      .setCorpus(
        path = "hdfs:///input/nlp/pos-tagger/masc_tagged/data/*",
        delimiter = "_",
        readAs = "SPARK_DATASET",
        options = posOptions
      )

  // Exposes "normalized" as "tokens_array" (an array of strings in the schema
  // printed below). setCleanAnnotations(false) goes together with the
  // annotation columns still being present in that schema.
  val token_finisher =
    new Finisher()
      .setInputCols("normalized")
      .setOutputCols("tokens_array")
      .setCleanAnnotations(false)
      .setOutputAsArray(true)

  // ---------------------------------------------------------------------------
  // Pipeline assembly: stages are listed in the order their input columns
  // become available.
  // ---------------------------------------------------------------------------
  val pipeline =
    new Pipeline().setStages(
      Array(
        documentAssembler,
        sentenceDetector,
        token,
        normalizer,
        stemmer,
        posTagger,
        token_finisher
      )
    )
  58.  
  // Fit every stage of `pipeline` on `test`, timing the call with
  // Benchmark.time under the label "Time to train model".
  // Author's note: test.count was 52414 rows (Wikipedia page titles).
  // NOTE(review): the pipeline is fitted on `test`, while `training` is the
  // DataFrame transformed afterwards -- confirm the two names are not swapped.
  val model = Benchmark.time("Time to train model") {
    pipeline.fit(test)
  }
  62. /*
  63. Time to train model: 1364.868964391sec
  64. model: org.apache.spark.ml.PipelineModel = pipeline_8df5ba357611
  65. */
  66.  
  // Run the fitted model over `training`, timing the call with Benchmark.time
  // under the label "Time for prediction".
  // NOTE(review): Spark DataFrame transformations are lazy, so this most
  // likely times plan construction only; the recorded ~0.14 s for a DataFrame
  // that later counts 5,208,697 rows is consistent with that. The real
  // annotation cost is paid by the first action (count / show) -- confirm
  // before quoting this figure as prediction time.
  val pipeLineDF = Benchmark.time("Time for prediction") {
    model.transform(training)
  }
  70. /*
  71. Time for prediction: 0.136970197sec
  72. pipeLineDF: org.apache.spark.sql.DataFrame = [id: string, title: string ... 7 more fields]
  73. */
  74.  
  // Print the column tree of the transformed DataFrame (output pasted below).
  pipeLineDF.printSchema()
  76. /*
  77. root
  78. |-- id: string (nullable = true)
  79. |-- title: string (nullable = true)
  80. |-- document: array (nullable = true)
  81. | |-- element: struct (containsNull = true)
  82. | | |-- annotatorType: string (nullable = true)
  83. | | |-- begin: integer (nullable = false)
  84. | | |-- end: integer (nullable = false)
  85. | | |-- result: string (nullable = true)
  86. | | |-- metadata: map (nullable = true)
  87. | | | |-- key: string
  88. | | | |-- value: string (valueContainsNull = true)
  89. |-- sentence: array (nullable = true)
  90. | |-- element: struct (containsNull = true)
  91. | | |-- annotatorType: string (nullable = true)
  92. | | |-- begin: integer (nullable = false)
  93. | | |-- end: integer (nullable = false)
  94. | | |-- result: string (nullable = true)
  95. | | |-- metadata: map (nullable = true)
  96. | | | |-- key: string
  97. | | | |-- value: string (valueContainsNull = true)
  98. |-- token: array (nullable = true)
  99. | |-- element: struct (containsNull = true)
  100. | | |-- annotatorType: string (nullable = true)
  101. | | |-- begin: integer (nullable = false)
  102. | | |-- end: integer (nullable = false)
  103. | | |-- result: string (nullable = true)
  104. | | |-- metadata: map (nullable = true)
  105. | | | |-- key: string
  106. | | | |-- value: string (valueContainsNull = true)
  107. |-- normalized: array (nullable = true)
  108. | |-- element: struct (containsNull = true)
  109. | | |-- annotatorType: string (nullable = true)
  110. | | |-- begin: integer (nullable = false)
  111. | | |-- end: integer (nullable = false)
  112. | | |-- result: string (nullable = true)
  113. | | |-- metadata: map (nullable = true)
  114. | | | |-- key: string
  115. | | | |-- value: string (valueContainsNull = true)
  116. |-- stem: array (nullable = true)
  117. | |-- element: struct (containsNull = true)
  118. | | |-- annotatorType: string (nullable = true)
  119. | | |-- begin: integer (nullable = false)
  120. | | |-- end: integer (nullable = false)
  121. | | |-- result: string (nullable = true)
  122. | | |-- metadata: map (nullable = true)
  123. | | | |-- key: string
  124. | | | |-- value: string (valueContainsNull = true)
  125. |-- pos: array (nullable = true)
  126. | |-- element: struct (containsNull = true)
  127. | | |-- annotatorType: string (nullable = true)
  128. | | |-- begin: integer (nullable = false)
  129. | | |-- end: integer (nullable = false)
  130. | | |-- result: string (nullable = true)
  131. | | |-- metadata: map (nullable = true)
  132. | | | |-- key: string
  133. | | | |-- value: string (valueContainsNull = true)
  134. |-- tokens_array: array (nullable = true)
  135. | |-- element: string (containsNull = true)
  136. */
  // Row count of the transformed DataFrame; this is a Spark action, so it
  // triggers execution of the pipeline.
  pipeLineDF.count()
  138. // res58: Long = 5208697
  139.  
  // Show the first 100 titles next to their predicted POS tags, without
  // truncating cell contents.
  pipeLineDF
    .select("title", "pos.result")
    .show(numRows = 100, truncate = false)
  141. /*
  142. +-------------------------------------------------------------------+--------------------------------------+
  143. |title |result |
  144. +-------------------------------------------------------------------+--------------------------------------+
  145. |The Sneetches and Other Stories |[DT, NNP, CC, JJ, NNS] |
  146. |Seocho-dong |[NNP] |
  147. |Pelargonium sidoides |[NNP, NNS] |
  148. |El Gran Combo de Puerto Rico |[NNP, NNP, NNP, IN, NNP, NNP] |
  149. |Love Is the Plan the Plan Is Death |[NN, VBZ, DT, NNP, DT, NNP, VBZ, NN] |
  150. |Pratima Kumari |[NNP, NNP] |
  151. |List of tropical and subtropical moist broadleaf forests ecoregions|[NN, IN, JJ, CC, JJ, NN, NN, NNS, NNS]|
  152. |2003–04 Segunda División |[CD, NNP, NNP] |
  153. |Lifetouch |[NNP] |
  154. |Metrostar |[NNP] |
  155. |CANPASS |[NNP] |
  156. |Fallen Angel (UK TV series) |[NNP, NNP, (, NNP, NNP, NN, )] |
  157. |Kuni-kyō |[NNP] |
  158. |Barham Salih |[NNP, NNP] |
  159. |Chokher Bali (film) |[NNP, NNP, (, NN, )] |
  160. |Durio dulcis |[NNP, NN] |
  161. |Florentine painting |[NNP, NN] |
  162. |Zoidogamy |[NNP] |
  163. |PO postcode area |[NNP, NN, NN] |
  164. |Eveleigh, New South Wales |[NNP, ,, NNP, NNP, NNP] |
  165. |Android Nim |[NNP, NNP] |
  166. |Kyle Dunnigan |[NNP, NNP] |
  167. |Jawad Bashir |[NNP, NNP] |
  168. |Continental O-190 |[NNP, NNP] |
  169. |List of 3D graphics libraries |[NN, IN, CD, NNS, NNS] |
  170. |British Universities and Colleges Sport |[JJ, NNP, CC, NNP, NNP] |
  171. |Horns of Hattin |[NNP, IN, NNP] |
  172. |Systemic risk |[NNP, NN] |
  173. |Ho Ching |[NNP, NNP] |
  174. |Blake's Lock |[NNP, POS, NNP] |
  175. |Vincent Buckley |[NNP, NNP] |
  176. |Steve Bozek |[NNP, NNP] |
  177. |The Bird and the Worm |[DT, NNP, CC, DT, NNP] |
  178. |MP3Gain |[NNP] |
  179. |Lost City of the Jungle |[NNP, NNP, IN, DT, NNP] |
  180. |Bible College of Malaysia |[NNP, NNP, IN, NNP] |
  181. |Grease duct |[NNP, NN] |
  182. |Air America (TV series) |[NNP, NNP, (, NN, NN, )] |
  183. |Water Framework Directive |[NNP, NNP, NNP] |
  184. |Regent Hotel |[NNP, NNP] |
  185. |One-shot (comics) |[NNP, (, NNS, )] |
  186. |Before We Were So Rudely Interrupted |[IN, PRP, VBD, RB, RB, NNP] |
  187. |Lindauer Dornier |[NNP, NNP] |
  188. |Mariner Software |[NNP, NNP] |
  189. |The Fisher-Girl and the Crab |[DT, NNP, CC, DT, NNP] |
  190. |7-orthoplex |[NN] |
  191. |French military mission to Japan (1872–80) |[JJ, JJ, NN, TO, NNP, (, CD, )] |
  192. |Hui Liangyu |[NNP, NNP] |
  193. |Christine Arron |[NNP, NNP] |
  194. |Moose test |[NNP, NN] |
  195. |Arrasando (song) |[NNP, (, NN, )] |
  196. |Daydream (1964 film) |[NNP, (, CD, NN, )] |
  197. |Anecdote of Men by the Thousands |[NNP, IN, NN, IN, DT, NNS] |
  198. |Strain (biology) |[NNP, (, NN, )] |
  199. |Haustrinae |[NNP] |
  200. |Cirrus Aircraft |[NNP, NNP] |
  201. |Syracuse High School (Syracuse, Utah) |[NNP, NNP, NNP, (, NNP, ,, NNP, )] |
  202. |Mezamashi TV |[NNP, NN] |
  203. |Vermont statistical areas |[NNP, JJ, NNS] |
  204. |Portugal during World War I |[NNP, IN, NNP, NNP, PRP] |
  205. |Cycles (The Doobie Brothers album) |[NNP, (, DT, NNP, NNP, NN, )] |
  206. |Inferior frontal sulcus |[NNP, JJ, NN] |
  207. |Saskatchewan Highway 41 |[NNP, NNP, CD] |
  208. |Barony Rosendal |[NNP, NNP] |
  209. |Mishima ware |[NNP, NN] |
  210. |Ijon Tichy |[NNP, NNP] |
  211. |Wilusa |[NNP] |
  212. |Thomas Dybdahl |[NNP, NNP] |
  213. |Adam Gardiner |[NNP, NNP] |
  214. |Fournier RF-9 |[NNP, NNP] |
  215. |Ola Sundell |[NNP, NNP] |
  216. |My Barbarian |[PRP$, NN] |
  217. |2004–05 Iraqi Premier League |[CD, NNP, NNP, NNP] |
  218. |Jean Wade Rindlaub |[NNP, NNP, NNP] |
  219. |Miskel Spillman |[NNP, NNP] |
  220. |Bonytail chub |[NNP, NN] |
  221. |Japanese Journal of Applied Physics |[JJ, NNP, IN, NNP, NNP] |
  222. |Disembowelment (band) |[NNP, (, NN, )] |
  223. |Brethren of the Coast |[NNP, IN, DT, NNP] |
  224. |Fly (exercise) |[NNP, (, NN, )] |
  225. |Mathilde Krim |[NNP, NNP] |
  226. |Usman Tariq |[NNP, NNP] |
  227. |Christopher Plunkett, 1st Baron of Dunsany |[NNP, NNP, ,, CD, NNP, IN, NNP] |
  228. |Wanna Get to Know You |[NNP, VB, TO, VB, PRP] |
  229. |Yaxuna |[NNP] |
  230. |Glass (Index Case album) |[NN, (, NNP, NN, NN, )] |
  231. |Christy Hemme |[NNP, NNP] |
  232. |Zod |[NNP] |
  233. |River City High |[NNP, NNP, NNP] |
  234. |William Fleming High School |[NNP, NNP, NNP, NNP] |
  235. |Wee Waa |[NNP, NNP] |
  236. |I3 |[NNP] |
  237. |Stephen V. Cole |[NNP, NNP, NNP] |
  238. |Royal Australian Army Nursing Corps |[NNP, JJ, NNP, NNP, NNP] |
  239. |The Courier |[DT, NNP] |
  240. |Olof Johansson |[NNP, NNP] |
  241. |Solicitor General of the United States |[NNP, NNP, IN, DT, NNP, NNPS] |
  242. |MWR |[NNP] |
  243. |Michael Boyer |[NNP, NNP] |
  244. |Common Fund for Commodities |[NNP, NNP, IN, NNP] |
  245. +-------------------------------------------------------------------+--------------------------------------+
  246. only showing top 100 rows
  247. */
Add Comment
Please, Sign In to add comment