Advertisement
Guest User

Untitled

a guest
Apr 20th, 2019
100
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.11 KB | None | 0 0
  1. package corenlp.process;
  2.  
  3. import java.io.BufferedReader;
  4. import java.io.IOException;
  5. import java.io.PrintWriter;
  6. import java.util.ArrayList;
  7. import java.util.List;
  8.  
  9. import edu.stanford.nlp.ling.CoreLabel;
  10. import edu.stanford.nlp.parser.nndep.DependencyParser;
  11. import edu.stanford.nlp.trees.GrammaticalStructure;
  12. import edu.stanford.nlp.trees.TypedDependency;
  13. import statistics.RAWF;
  14.  
  15. public class Converter {
  16.  
  17.  
  18. String modelPath ;
  19.  
  20. public Converter(String modelPath) {
  21. this.modelPath = modelPath;
  22.  
  23. }
  24.  
  25. /**
  26. * Read CoNLL-2003
  27. * @param path
  28. * @param writePath
  29. * @throws IOException
  30. */
  31. public void readData(String path, String writePath) throws IOException{
  32. DependencyParser parser = DependencyParser.loadFromModelFile(this.modelPath);
  33.  
  34. BufferedReader br = RAWF.reader(path);
  35. PrintWriter pw = RAWF.writer(writePath);
  36. String line = null;
  37. List<CoreLabel> words = new ArrayList<>();
  38. ArrayList<String> output = new ArrayList<String>();
  39. while((line = br.readLine())!=null){
  40. if(line.equals("")){
  41.  
  42. GrammaticalStructure gs = parser.predict(words);
  43. List<TypedDependency> deps = new ArrayList<>(gs.typedDependencies());
  44. int[] heads = new int[words.size()];
  45. String[] depLabels = new String[words.size()];
  46. for (TypedDependency dep : deps) {
  47. heads[dep.dep().index() - 1] = dep.gov().index() - 1;
  48. depLabels[dep.dep().index() - 1] = dep.reln().getShortName();
  49. // System.out.println(dep.gov().index() +", " +dep.dep().index()+ ", " + dep.reln().getShortName());
  50. }
  51. // System.exit(0);
  52.  
  53. for (int p = 0; p < words.size(); p++) {
  54. CoreLabel word = words.get(p);
  55. int head = heads[p] + 1;
  56. pw.println((p+1) + "\t"+ word.word() + "\t_\t" +word.tag()+"\t"+word.tag()+"\t_\t"+head+"\t"+depLabels[p]+"\t_\t_\t"+output.get(p) );
  57. }
  58. pw.println();
  59. words = new ArrayList<CoreLabel>();
  60. output = new ArrayList<String>();
  61. } else {
  62. String[] values = line.split(" ");
  63. String entity = values[2];
  64. output.add(entity);
  65. CoreLabel token = new CoreLabel();
  66. token.setWord(values[0]);
  67. token.setTag(values[1]);
  68. words.add(token);
  69. }
  70. }
  71. br.close();
  72. pw.close();
  73. }
  74.  
  75. /**
  76. * Read CoNLL-2003
  77. * @param path
  78. * @param writePath
  79. * @throws IOException
  80. */
  81. public void readOntoNotes(String path, String writePath) throws IOException{
  82. DependencyParser parser = DependencyParser.loadFromModelFile(this.modelPath);
  83.  
  84. BufferedReader br = RAWF.reader(path);
  85. PrintWriter pw = RAWF.writer(writePath);
  86. String line = null;
  87. List<CoreLabel> words = new ArrayList<>();
  88. ArrayList<String> output = new ArrayList<String>();
  89. while((line = br.readLine())!=null){
  90. if(line.equals("")){
  91.  
  92. GrammaticalStructure gs = parser.predict(words);
  93. List<TypedDependency> deps = new ArrayList<>(gs.typedDependencies());
  94. int[] heads = new int[words.size()];
  95. String[] depLabels = new String[words.size()];
  96. for (TypedDependency dep : deps) {
  97. heads[dep.dep().index() - 1] = dep.gov().index() - 1;
  98. depLabels[dep.dep().index() - 1] = dep.reln().getShortName();
  99. // System.out.println(dep.gov().index() +", " +dep.dep().index()+ ", " + dep.reln().getShortName());
  100. }
  101. // System.exit(0);
  102.  
  103. for (int p = 0; p < words.size(); p++) {
  104. CoreLabel word = words.get(p);
  105. int head = heads[p] + 1;
  106. pw.println((p+1) + "\t"+ word.word() + "\t_\t" +word.tag()+"\t"+word.tag()+"\t_\t"+head+"\t"+depLabels[p]+"\t_\t_\t"+output.get(p) );
  107. }
  108. pw.println();
  109. words = new ArrayList<CoreLabel>();
  110. output = new ArrayList<String>();
  111. } else {
  112. String[] values = line.split("\t");
  113. String word = values[1];
  114. String pos = values[3];
  115. String entity = values[values.length - 1];
  116. output.add(entity);
  117. CoreLabel token = new CoreLabel();
  118. token.setWord(word);
  119. token.setTag(pos);
  120. words.add(token);
  121. }
  122. }
  123. br.close();
  124. pw.close();
  125. }
  126.  
  127. public static void main(String... args) throws IOException {
  128.  
  129.  
  130.  
  131. String[] x = new String[]{"SD", "UD"};
  132. for(String type : x) {
  133. String path = "edu/stanford/nlp/models/parser/nndep/english_"+type+".gz";
  134. Converter conv = new Converter(path);
  135. // conv.readData("data/conll2003/train.txt", "data/conll2003/train."+type.toLowerCase()+".conllx");
  136. // conv.readData("data/conll2003/dev.txt", "data/conll2003/dev."+type.toLowerCase()+".conllx");
  137. // conv.readData("data/conll2003/test.txt", "data/conll2003/test."+type.toLowerCase()+".conllx");
  138.  
  139. conv.readOntoNotes("data/ontonotes/train.sd.conllx", "data/ontonotes/train.pred"+type.toLowerCase()+".conllx");
  140. conv.readOntoNotes("data/ontonotes/dev.sd.conllx", "data/ontonotes/dev.pred"+type.toLowerCase()+".conllx");
  141. conv.readOntoNotes("data/ontonotes/test.sd.conllx", "data/ontonotes/test.pred"+type.toLowerCase()+".conllx");
  142. }
  143.  
  144. }
  145.  
  146. // public static void main(String... args) {
  147. // String modelPath = DependencyParser.DEFAULT_MODEL;
  148. // String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
  149. //
  150. // for (int argIndex = 0; argIndex < args.length; ) {
  151. // switch (args[argIndex]) {
  152. // case "-tagger":
  153. // taggerPath = args[argIndex + 1];
  154. // argIndex += 2;
  155. // break;
  156. // case "-model":
  157. // modelPath = args[argIndex + 1];
  158. // argIndex += 2;
  159. // break;
  160. // default:
  161. // throw new RuntimeException("Unknown argument " + args[argIndex]);
  162. // }
  163. // }
  164. //
  165. // String text = "I can almost always tell when movies use fake dinosaurs.";
  166. //
  167. // MaxentTagger tagger = new MaxentTagger(taggerPath);
  168. //
  169. //
  170. //// DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
  171. //// for (List<HasWord> sentence : tokenizer) {
  172. //// List<TaggedWord> tagged = tagger.tagSentence(sentence);
  173. //// GrammaticalStructure gs = parser.predict(tagged);
  174. //// System.out.println(gs.toString());
  175. //// }
  176. //
  177. // CoreLabel token = new CoreLabel();
  178. // token.setWord("I");
  179. // token.setTag("NN");
  180. // List<CoreLabel> sent = new ArrayList<>();
  181. // sent.add(token);
  182. // GrammaticalStructure gs = parser.predict(sent);
  183. // System.out.println(gs.toString());
  184. // }
  185. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement