Guest User

Untitled

a guest
Aug 28th, 2012
131
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.51 KB | None | 0 0
  1. package foo;
  2.  
  3. import edu.stanford.nlp.fsm.ExactGrammarCompactor;
  4. import edu.stanford.nlp.io.IOUtils;
  5. import edu.stanford.nlp.io.NumberRangeFileFilter;
  6. import edu.stanford.nlp.io.NumberRangesFileFilter;
  7. import edu.stanford.nlp.ling.*;
  8. import edu.stanford.nlp.objectbank.TokenizerFactory;
  9. import edu.stanford.nlp.parser.ViterbiParser;
  10. import edu.stanford.nlp.parser.KBestViterbiParser;
  11. import edu.stanford.nlp.process.DocumentPreprocessor;
  12. import edu.stanford.nlp.util.Function;
  13. import edu.stanford.nlp.process.WhitespaceTokenizer;
  14. import edu.stanford.nlp.trees.*;
  15. import edu.stanford.nlp.trees.international.arabic.ArabicTreebankLanguagePack;
  16. import edu.stanford.nlp.util.Generics;
  17. import edu.stanford.nlp.util.Numberer;
  18. import edu.stanford.nlp.util.Pair;
  19. import edu.stanford.nlp.util.Timing;
  20. import edu.stanford.nlp.util.ScoredObject;
  21.  
  22. import java.io.*;
  23. import java.text.DecimalFormat;
  24. import java.text.NumberFormat;
  25. import java.util.*;
  26. import java.util.zip.GZIPOutputStream;
  27. import java.util.*;
  28. import edu.stanford.nlp.trees.*;
  29. import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
  30. import edu.stanford.nlp.process.PTBTokenizer;
  31.  
  32. public class RunStanfordParser {
  33. /**
  34. *
  35. * @param args Arg1 - full path of the stanford parser input file (englishPCFG.ser.gz), Arg2 - file to parse
  36. * @throws Exception
  37. */
  38. public static void main(String[] args) throws Exception {
  39. // input format: data directory, and output directory
  40. String parserFileOrUrl=args[0];
  41. String fileToParse=args[1];
  42.  
  43. LexicalizedParser lp = new LexicalizedParser(parserFileOrUrl); // Create new parser
  44. //lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"}); // set max sentence length if you want
  45.  
  46. // Call parser on files, and tokenize the contents
  47. FileInputStream fstream = new FileInputStream(fileToParse);
  48. DataInputStream in = new DataInputStream(fstream); // Get the object of DataInputStream
  49. BufferedReader br = new BufferedReader(new InputStreamReader(in));
  50. StringReader sr; // we need to re-read each line into its own reader because the tokenizer is over-complicated garbage
  51. PTBTokenizer tkzr; // tokenizer object
  52. WordStemmer ls = new WordStemmer(); // stemmer/lemmatizer object
  53.  
  54. // Read File Line By Line
  55. String strLine;
  56. while ((strLine = br.readLine()) != null) {
  57. System.out.println ("Tokenizing and Parsing: "+strLine); // print current line to console
  58.  
  59. // do all the standard java over-complication to use the stanford parser tokenizer
  60. sr = new StringReader(strLine);
  61. tkzr = PTBTokenizer.newPTBTokenizer(sr);
  62. List toks = tkzr.tokenize();
  63. System.out.println ("tokens: "+toks);
  64.  
  65. Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something
  66.  
  67. // Output Option 1: Printing out various data by accessing it programmatically
  68.  
  69. // Get words, stemmed words and POS tags
  70. ArrayList<String> words = new ArrayList();
  71. ArrayList<String> stems = new ArrayList();
  72. ArrayList<String> tags = new ArrayList();
  73.  
  74. // Get words and Tags
  75. for (TaggedWord tw : parse.taggedYield()){
  76. words.add(tw.word());
  77. tags.add(tw.tag());
  78. }
  79.  
  80. // Get stems
  81. ls.visitTree(parse); // apply the stemmer to the tree
  82. for (TaggedWord tw : parse.taggedYield()){
  83. stems.add(tw.word());
  84. }
  85.  
  86. // Get dependency tree
  87. TreebankLanguagePack tlp = new PennTreebankLanguagePack();
  88. GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  89. GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
  90. Collection tdl = gs.typedDependenciesCollapsed();
  91.  
  92. // And print!
  93. System.out.println("words: "+words);
  94. System.out.println("POStags: "+tags);
  95. System.out.println("stemmedWordsAndTags: "+stems);
  96. System.out.println("typedDependencies: "+tdl);
  97.  
  98. // Output Option 2: Printing out various data using TreePrint
  99.  
  100. // Various TreePrint options
  101. // "penn", // constituency parse
  102. // "oneline",
  103. // rootLabelOnlyFormat,
  104. // "words",
  105. // "wordsAndTags", // unstemmed words and pos tags
  106. // "dependencies", // unlabeled dependency parse
  107. // "typedDependencies", // dependency parse
  108. // "typedDependenciesCollapsed",
  109. // "latexTree",
  110. // "collocations",
  111. // "semanticGraph"
  112.  
  113. // Print using TreePrint with various options
  114. //TreePrint tp = new TreePrint("wordsAndTags,typedDependencies");
  115. //tp.printTree(parse);
  116.  
  117. System.out.println(); // separate output lines
  118. }
  119.  
  120. }
  121.  
  122. }
Add Comment
Please, Sign In to add comment