Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package corenlp.process;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.PrintWriter;
- import java.util.ArrayList;
- import java.util.List;
- import edu.stanford.nlp.ling.CoreLabel;
- import edu.stanford.nlp.parser.nndep.DependencyParser;
- import edu.stanford.nlp.trees.GrammaticalStructure;
- import edu.stanford.nlp.trees.TypedDependency;
- import statistics.RAWF;
- public class Converter {
- String modelPath ;
- public Converter(String modelPath) {
- this.modelPath = modelPath;
- }
- /**
- * Read CoNLL-2003
- * @param path
- * @param writePath
- * @throws IOException
- */
- public void readData(String path, String writePath) throws IOException{
- DependencyParser parser = DependencyParser.loadFromModelFile(this.modelPath);
- BufferedReader br = RAWF.reader(path);
- PrintWriter pw = RAWF.writer(writePath);
- String line = null;
- List<CoreLabel> words = new ArrayList<>();
- ArrayList<String> output = new ArrayList<String>();
- while((line = br.readLine())!=null){
- if(line.equals("")){
- GrammaticalStructure gs = parser.predict(words);
- List<TypedDependency> deps = new ArrayList<>(gs.typedDependencies());
- int[] heads = new int[words.size()];
- String[] depLabels = new String[words.size()];
- for (TypedDependency dep : deps) {
- heads[dep.dep().index() - 1] = dep.gov().index() - 1;
- depLabels[dep.dep().index() - 1] = dep.reln().getShortName();
- // System.out.println(dep.gov().index() +", " +dep.dep().index()+ ", " + dep.reln().getShortName());
- }
- // System.exit(0);
- for (int p = 0; p < words.size(); p++) {
- CoreLabel word = words.get(p);
- int head = heads[p] + 1;
- pw.println((p+1) + "\t"+ word.word() + "\t_\t" +word.tag()+"\t"+word.tag()+"\t_\t"+head+"\t"+depLabels[p]+"\t_\t_\t"+output.get(p) );
- }
- pw.println();
- words = new ArrayList<CoreLabel>();
- output = new ArrayList<String>();
- } else {
- String[] values = line.split(" ");
- String entity = values[2];
- output.add(entity);
- CoreLabel token = new CoreLabel();
- token.setWord(values[0]);
- token.setTag(values[1]);
- words.add(token);
- }
- }
- br.close();
- pw.close();
- }
- /**
- * Read CoNLL-2003
- * @param path
- * @param writePath
- * @throws IOException
- */
- public void readOntoNotes(String path, String writePath) throws IOException{
- DependencyParser parser = DependencyParser.loadFromModelFile(this.modelPath);
- BufferedReader br = RAWF.reader(path);
- PrintWriter pw = RAWF.writer(writePath);
- String line = null;
- List<CoreLabel> words = new ArrayList<>();
- ArrayList<String> output = new ArrayList<String>();
- while((line = br.readLine())!=null){
- if(line.equals("")){
- GrammaticalStructure gs = parser.predict(words);
- List<TypedDependency> deps = new ArrayList<>(gs.typedDependencies());
- int[] heads = new int[words.size()];
- String[] depLabels = new String[words.size()];
- for (TypedDependency dep : deps) {
- heads[dep.dep().index() - 1] = dep.gov().index() - 1;
- depLabels[dep.dep().index() - 1] = dep.reln().getShortName();
- // System.out.println(dep.gov().index() +", " +dep.dep().index()+ ", " + dep.reln().getShortName());
- }
- // System.exit(0);
- for (int p = 0; p < words.size(); p++) {
- CoreLabel word = words.get(p);
- int head = heads[p] + 1;
- pw.println((p+1) + "\t"+ word.word() + "\t_\t" +word.tag()+"\t"+word.tag()+"\t_\t"+head+"\t"+depLabels[p]+"\t_\t_\t"+output.get(p) );
- }
- pw.println();
- words = new ArrayList<CoreLabel>();
- output = new ArrayList<String>();
- } else {
- String[] values = line.split("\t");
- String word = values[1];
- String pos = values[3];
- String entity = values[values.length - 1];
- output.add(entity);
- CoreLabel token = new CoreLabel();
- token.setWord(word);
- token.setTag(pos);
- words.add(token);
- }
- }
- br.close();
- pw.close();
- }
- public static void main(String... args) throws IOException {
- String[] x = new String[]{"SD", "UD"};
- for(String type : x) {
- String path = "edu/stanford/nlp/models/parser/nndep/english_"+type+".gz";
- Converter conv = new Converter(path);
- // conv.readData("data/conll2003/train.txt", "data/conll2003/train."+type.toLowerCase()+".conllx");
- // conv.readData("data/conll2003/dev.txt", "data/conll2003/dev."+type.toLowerCase()+".conllx");
- // conv.readData("data/conll2003/test.txt", "data/conll2003/test."+type.toLowerCase()+".conllx");
- conv.readOntoNotes("data/ontonotes/train.sd.conllx", "data/ontonotes/train.pred"+type.toLowerCase()+".conllx");
- conv.readOntoNotes("data/ontonotes/dev.sd.conllx", "data/ontonotes/dev.pred"+type.toLowerCase()+".conllx");
- conv.readOntoNotes("data/ontonotes/test.sd.conllx", "data/ontonotes/test.pred"+type.toLowerCase()+".conllx");
- }
- }
- // public static void main(String... args) {
- // String modelPath = DependencyParser.DEFAULT_MODEL;
- // String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
- //
- // for (int argIndex = 0; argIndex < args.length; ) {
- // switch (args[argIndex]) {
- // case "-tagger":
- // taggerPath = args[argIndex + 1];
- // argIndex += 2;
- // break;
- // case "-model":
- // modelPath = args[argIndex + 1];
- // argIndex += 2;
- // break;
- // default:
- // throw new RuntimeException("Unknown argument " + args[argIndex]);
- // }
- // }
- //
- // String text = "I can almost always tell when movies use fake dinosaurs.";
- //
- // MaxentTagger tagger = new MaxentTagger(taggerPath);
- //
- //
- //// DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
- //// for (List<HasWord> sentence : tokenizer) {
- //// List<TaggedWord> tagged = tagger.tagSentence(sentence);
- //// GrammaticalStructure gs = parser.predict(tagged);
- //// System.out.println(gs.toString());
- //// }
- //
- // CoreLabel token = new CoreLabel();
- // token.setWord("I");
- // token.setTag("NN");
- // List<CoreLabel> sent = new ArrayList<>();
- // sent.add(token);
- // GrammaticalStructure gs = parser.predict(sent);
- // System.out.println(gs.toString());
- // }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement