Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package foo;
- import edu.stanford.nlp.fsm.ExactGrammarCompactor;
- import edu.stanford.nlp.io.IOUtils;
- import edu.stanford.nlp.io.NumberRangeFileFilter;
- import edu.stanford.nlp.io.NumberRangesFileFilter;
- import edu.stanford.nlp.ling.*;
- import edu.stanford.nlp.objectbank.TokenizerFactory;
- import edu.stanford.nlp.parser.ViterbiParser;
- import edu.stanford.nlp.parser.KBestViterbiParser;
- import edu.stanford.nlp.process.DocumentPreprocessor;
- import edu.stanford.nlp.util.Function;
- import edu.stanford.nlp.process.WhitespaceTokenizer;
- import edu.stanford.nlp.trees.*;
- import edu.stanford.nlp.trees.international.arabic.ArabicTreebankLanguagePack;
- import edu.stanford.nlp.util.Generics;
- import edu.stanford.nlp.util.Numberer;
- import edu.stanford.nlp.util.Pair;
- import edu.stanford.nlp.util.Timing;
- import edu.stanford.nlp.util.ScoredObject;
- import java.io.*;
- import java.text.DecimalFormat;
- import java.text.NumberFormat;
- import java.util.*;
- import java.util.zip.GZIPOutputStream;
- import java.util.*;
- import edu.stanford.nlp.trees.*;
- import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
- import edu.stanford.nlp.process.PTBTokenizer;
- public class RunStanfordParser {
- /**
- *
- * @param args Arg1 - full path of the stanford parser input file (englishPCFG.ser.gz), Arg2 - file to parse
- * @throws Exception
- */
- public static void main(String[] args) throws Exception {
- // input format: data directory, and output directory
- String parserFileOrUrl=args[0];
- String fileToParse=args[1];
- LexicalizedParser lp = new LexicalizedParser(parserFileOrUrl); // Create new parser
- //lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"}); // set max sentence length if you want
- // Call parser on files, and tokenize the contents
- FileInputStream fstream = new FileInputStream(fileToParse);
- DataInputStream in = new DataInputStream(fstream); // Get the object of DataInputStream
- BufferedReader br = new BufferedReader(new InputStreamReader(in));
- StringReader sr; // we need to re-read each line into its own reader because the tokenizer is over-complicated garbage
- PTBTokenizer tkzr; // tokenizer object
- WordStemmer ls = new WordStemmer(); // stemmer/lemmatizer object
- // Read File Line By Line
- String strLine;
- while ((strLine = br.readLine()) != null) {
- System.out.println ("Tokenizing and Parsing: "+strLine); // print current line to console
- // do all the standard java over-complication to use the stanford parser tokenizer
- sr = new StringReader(strLine);
- tkzr = PTBTokenizer.newPTBTokenizer(sr);
- List toks = tkzr.tokenize();
- System.out.println ("tokens: "+toks);
- Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something
- // Output Option 1: Printing out various data by accessing it programmatically
- // Get words, stemmed words and POS tags
- ArrayList<String> words = new ArrayList();
- ArrayList<String> stems = new ArrayList();
- ArrayList<String> tags = new ArrayList();
- // Get words and Tags
- for (TaggedWord tw : parse.taggedYield()){
- words.add(tw.word());
- tags.add(tw.tag());
- }
- // Get stems
- ls.visitTree(parse); // apply the stemmer to the tree
- for (TaggedWord tw : parse.taggedYield()){
- stems.add(tw.word());
- }
- // Get dependency tree
- TreebankLanguagePack tlp = new PennTreebankLanguagePack();
- GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
- GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
- Collection tdl = gs.typedDependenciesCollapsed();
- // And print!
- System.out.println("words: "+words);
- System.out.println("POStags: "+tags);
- System.out.println("stemmedWordsAndTags: "+stems);
- System.out.println("typedDependencies: "+tdl);
- // Output Option 2: Printing out various data using TreePrint
- // Various TreePrint options
- // "penn", // constituency parse
- // "oneline",
- // rootLabelOnlyFormat,
- // "words",
- // "wordsAndTags", // unstemmed words and pos tags
- // "dependencies", // unlabeled dependency parse
- // "typedDependencies", // dependency parse
- // "typedDependenciesCollapsed",
- // "latexTree",
- // "collocations",
- // "semanticGraph"
- // Print using TreePrint with various options
- //TreePrint tp = new TreePrint("wordsAndTags,typedDependencies");
- //tp.printTree(parse);
- System.out.println(); // separate output lines
- }
- }
- }
Add Comment
Please, Sign In to add comment