Guest User

Untitled

a guest
Jan 22nd, 2018
61
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.84 KB | None | 0 0
  /**
   * Allows usage of Stanford CoreNLP from Spark code.
   *
   * The instance itself is serializable, but the CoreNLP pipeline it wraps is
   * marked `@transient` and built lazily, so it is never shipped with the
   * instance: each JVM that receives (or owns) an instance builds its own
   * pipeline the first time [[transform]] is called.
   *
   * @param annotators the CoreNLP annotator list, stored under the `annotators` property
   * @param params     extra (property name, property value) pairs applied to the
   *                   pipeline's `Properties` after `annotators`
   */
  class NLPPipeline(annotators: String, params: Tuple2[String, String]*) extends Serializable {
    import edu.stanford.nlp.pipeline._
    import java.util.Properties

    /**
     * CoreNLP pipeline local to the current JVM, configured from the
     * constructor parameters.
     *
     * A `lazy val` is used instead of a nullable `var` so that initialisation
     * is synchronised: concurrent callers sharing one instance cannot race and
     * build several pipelines. `@transient` keeps it out of the serialized
     * form, so it is rebuilt on first use after deserialization.
     */
    @transient private lazy val nlpPipeline: StanfordCoreNLP = {
      val props = new Properties()
      props.setProperty("annotators", annotators)
      // foreach rather than map: only the side effect on `props` is wanted,
      // and an empty `params` is simply a no-op.
      params.foreach { case (key, value) => props.setProperty(key, value) }
      new StanfordCoreNLP(props)
    }

    /**
     * Basic step of the pipeline: runs the configured annotators over a text.
     *
     * @param keyword the text to be processed
     * @return the CoreNLP document produced by the pipeline for `keyword`
     */
    def transform(keyword: String): Annotation = nlpPipeline.process(keyword)
  }
  37.  
  /**
   * Example object implementing a lemmatization pipeline
   * (annotators: tokenize, ssplit, pos, lemma).
   */
  object Lemma extends NLPPipeline("tokenize, ssplit, pos, lemma") {
    import edu.stanford.nlp.ling.CoreAnnotations._
    import org.apache.spark.sql.expressions.UserDefinedFunction
    import org.apache.spark.sql.functions.udf
    // Explicit converters (`.asScala`) replace the deprecated implicit JavaConversions.
    import scala.collection.JavaConverters._

    /**
     * Helper class to give nice structure to the results in a DataFrame.
     *
     * @param tokens the text of every token, in document order
     * @param lemmas the lemma of every token, aligned index-by-index with `tokens`
     */
    case class Lemmas(tokens: Seq[String], lemmas: Seq[String])

    /** Runs the pipeline on one non-null text and collects its tokens and lemmas. */
    private def lemmasOf(text: String): Lemmas = {
      val doc = transform(text)
      // `get` is a Java call that may return null when the annotation is absent;
      // Option(...) turns that into "no sentences" instead of an NPE.
      val tokens = Option(doc.get(classOf[SentencesAnnotation]))
        .toList
        .flatMap(_.asScala)
        .flatMap(_.get(classOf[TokensAnnotation]).asScala)
      Lemmas(tokens.map(_.get(classOf[TextAnnotation])), tokens.map(_.get(classOf[LemmaAnnotation])))
    }

    /**
     * udf to run the pipeline on a dataframe column.
     *
     * A null `keyword` yields an empty [[Lemmas]] rather than being passed on
     * to CoreNLP.
     *
     * @return a Spark UDF mapping a string column to a `Lemmas` struct
     */
    def lemmatize: UserDefinedFunction = udf((keyword: String) =>
      Option(keyword).map(lemmasOf).getOrElse(Lemmas(Nil, Nil))
    )
  }
Add Comment
Please, Sign In to add comment