StanfordLemmatizer

a guest
Apr 16th, 2019
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class StanfordLemmatizer {

    protected StanfordCoreNLP pipeline;

    public StanfordLemmatizer() {
        // Create the StanfordCoreNLP properties, with POS tagging
        // (required for lemmatization) and lemmatization
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma");

        /*
         * This is a pipeline that takes in a string and returns various analyzed
         * linguistic forms. The String is tokenized via a tokenizer (such as
         * PTBTokenizerAnnotator), and then other sequence-model-style annotators
         * can be used to add things like lemmas, POS tags, and named entities.
         * These are returned as a list of CoreLabels. Other analysis components
         * build and store parse trees, dependency graphs, etc.
         *
         * This class is designed to apply multiple Annotators to an Annotation.
         * The idea is that you first build up the pipeline by adding Annotators,
         * then pass in the objects you wish to annotate and get back a fully
         * annotated object.
         *
         * StanfordCoreNLP loads a lot of models, so you probably only want to
         * do this once per execution.
         */
        this.pipeline = new StanfordCoreNLP(props);
    }

    public List<String> lemmatize(String documentText) {
        List<String> lemmas = new LinkedList<String>();
        // Create an empty Annotation just with the given text
        Annotation document = new Annotation(documentText);
        // Run all Annotators on this text
        this.pipeline.annotate(document);
        // Iterate over all of the sentences found
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            // Iterate over all tokens in a sentence
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                // Retrieve the lemma for each token and add it to the list of lemmas
                lemmas.add(token.get(LemmaAnnotation.class));
            }
        }
        return lemmas;
    }

    public static void main(String[] args) throws IOException {

        // Read the input file and convert it to a String
        String contents = new String(Files.readAllBytes(Paths.get("Vocabulaire_Attributes.txt")));
        // System.out.println(contents);

        System.out.println("Starting Stanford Lemmatizer ...");

        StanfordLemmatizer slem = new StanfordLemmatizer();
        System.out.println(slem.lemmatize(contents));
    }

}
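
A minimal usage sketch, assuming the class above is compiled together with the Stanford CoreNLP jar and its English models on the classpath. The class name StanfordLemmatizerDemo and the sample sentence are illustrative additions, not part of the original paste:

public class StanfordLemmatizerDemo {

    public static void main(String[] args) {
        // Build the pipeline once; model loading is the expensive step
        StanfordLemmatizer slem = new StanfordLemmatizer();

        // With the standard English models this typically yields lemmas such as
        // "cats" -> "cat", "were" -> "be", "running" -> "run"; the exact output
        // depends on the CoreNLP version and models in use.
        java.util.List<String> lemmas = slem.lemmatize("The cats were running quickly.");
        System.out.println(lemmas);
    }
}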