SHARE
TWEET

StanfordLemmatizer

a guest Apr 16th, 2019 74 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import java.io.*;
  2. import java.nio.file.Files;
  3. import java.nio.file.Paths;
  4. import java.util.LinkedList;
  5. import java.util.List;
  6. import java.util.Properties;
  7.  
  8. import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
  9. import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
  10. import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
  11. import edu.stanford.nlp.ling.CoreLabel;
  12. import edu.stanford.nlp.pipeline.Annotation;
  13. import edu.stanford.nlp.pipeline.StanfordCoreNLP;
  14. import edu.stanford.nlp.util.CoreMap;
  15.  
  16. public class StanfordLemmatizer {
  17.  
  18.     protected StanfordCoreNLP pipeline;
  19.  
  20.     public StanfordLemmatizer() {
  21.         // Create StanfordCoreNLP object properties, with POS tagging
  22.         // (required for lemmatization), and lemmatization
  23.         Properties props;
  24.         props = new Properties();
  25.         props.put("annotators", "tokenize, ssplit, pos, lemma");
  26.  
  27.         /*
  28.          * This is a pipeline that takes in a string and returns various analyzed linguistic forms.
  29.          * The String is tokenized via a tokenizer (such as PTBTokenizerAnnotator),
  30.          * and then other sequence model style annotation can be used to add things like lemmas,
  31.          * POS tags, and named entities. These are returned as a list of CoreLabels.
  32.          * Other analysis components build and store parse trees, dependency graphs, etc.
  33.          *
  34.          * This class is designed to apply multiple Annotators to an Annotation.
  35.          * The idea is that you first build up the pipeline by adding Annotators,
  36.          * and then you take the objects you wish to annotate and pass them in and
  37.          * get in return a fully annotated object.
  38.          *
  39.          *  StanfordCoreNLP loads a lot of models, so you probably
  40.          *  only want to do this once per execution
  41.          */
  42.         this.pipeline = new StanfordCoreNLP(props);
  43.     }
  44.  
  45.     public List<String> lemmatize(String documentText)
  46.     {
  47.         List<String> lemmas = new LinkedList<String>();
  48.         // Create an empty Annotation just with the given text
  49.         Annotation document = new Annotation(documentText);
  50.         // run all Annotators on this text
  51.         this.pipeline.annotate(document);
  52.         // Iterate over all of the sentences found
  53.         List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  54.         for(CoreMap sentence: sentences) {
  55.             // Iterate over all tokens in a sentence
  56.             for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
  57.                 // Retrieve and add the lemma for each word into the
  58.                 // list of lemmas
  59.                 lemmas.add(token.get(LemmaAnnotation.class));
  60.             }
  61.         }
  62.         return lemmas;
  63.     }
  64.  
  65.  
  66.     public static void main(String[] args) throws IOException {
  67.  
  68.         // Read file text_lemmatizer et convert to String
  69.         String contents = new String(Files.readAllBytes(Paths.get("Vocabulaire_Attributes.txt")));
  70.         // System.out.println(contents);
  71.  
  72.         System.out.println("Starting Stanford Lemmatizer ...");
  73.  
  74.         StanfordLemmatizer slem = new StanfordLemmatizer();
  75.         System.out.println(slem.lemmatize(contents));
  76.     }
  77.  
  78. }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top