import com.aliasi.chunk.AbstractCharLmRescoringChunker;
import com.aliasi.chunk.BioTagChunkCodec;
import com.aliasi.chunk.Chunking;
import com.aliasi.chunk.TagChunkCodec;
import com.aliasi.tag.Tagging;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.AbstractExternalizable;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.lang.String;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.StringTokenizer;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.mutable.MutableInt;
/**
 * Batch driver: for every serialized chunker model found in the model folder it
 * tags the raw data file sentence by sentence, writes the predictions as
 * tab-separated {@code token<TAB>tag} lines to an {@code .iob2} file, and then
 * runs the external Perl evaluation script against the gold-standard file.
 */
public class test {

    /** Splits a {@code token/TAG} pair at the slash in front of the tag, leaving other slashes alone. */
    private static final Pattern TOKEN_TAG_SLASH = Pattern.compile("(.*?)/([IB]_[^ \n]+|O)");

    /** Matches a B/I tag written with an underscore ({@code B_x}, {@code I_x}) right after the tab. */
    private static final Pattern TAG_UNDERSCORE = Pattern.compile("\t([IB])_(.*)");

    /**
     * Tags the data file with each model, writes one prediction file per model and
     * evaluates it.
     *
     * @param args ignored; all locations are hard-coded below
     * @throws Exception if a model cannot be deserialized, an input file cannot be
     *         read, or the data and format files cannot be aligned
     */
    @SuppressWarnings("rawtypes")
    public static void main(String[] args) throws Exception {
        long startTime = System.currentTimeMillis();
        String basePath = "/Users/davide/Documents/MPhil ACS/Machine Learning/ML Project/";
        File modelFolder = new File(basePath + "Models/");
        File dataFormat = new File(basePath + "Charbased model/DataFormat.raw");
        File dataFile = new File(basePath + "Charbased model/Data/sampletest1.raw");
        File evalScript = new File(basePath + "Dataset/Genia4ERtraining/evalIOB2.pl");
        File solution = new File(basePath + "Dataset/Genia4ERtraining/sampletest1.iob2");
        String evalFolder = basePath + "Evaluations/";

        File[] models;
        if (modelFolder.isDirectory()) {
            models = modelFolder.listFiles();
            if (models == null) {
                // listFiles() returns null on an I/O error; fail with a clear message instead of an NPE.
                throw new IOException("Cannot list model folder: " + modelFolder);
            }
        } else {
            models = new File[] {modelFolder};
        }

        for (File model : models) {
            if (model.getName().equals(".DS_Store")) {
                continue;
            }
            System.out.println(model);
            AbstractCharLmRescoringChunker chunker =
                    (AbstractCharLmRescoringChunker) AbstractExternalizable.readObject(model);
            TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE;
            boolean enforceConsistency = true;
            TagChunkCodec tagChunkCodec = new BioTagChunkCodec(tokenizerFactory, enforceConsistency);

            String writingloc = basePath + "TestResults" + model.getName() + ".iob2";
            writePredictions(chunker, tagChunkCodec, dataFile, dataFormat, writingloc);
            runEvaluation(evalScript, solution, writingloc,
                    new File(evalFolder + model.getName() + ".txt"));
        }

        long endTime = System.currentTimeMillis();
        NumberFormat formatter = new DecimalFormat("#0.00000");
        System.out.print("All done. Execution time is "
                + formatter.format((endTime - startTime) / 1000d) + " seconds");
    } // main end

    /**
     * Tags every line of {@code dataFile} and writes the result to {@code outputPath}.
     * The format file is read in lock-step (one line per data line); its
     * whitespace-separated tokens decide how tagger tokens are grouped per output line.
     * Empty data lines produce an empty output line. Readers and the writer use the
     * platform default charset, as before, and are always closed.
     *
     * @throws IOException if a file cannot be read or written
     * @throws IllegalStateException if the format file has fewer lines than the data
     *         file, or a sentence cannot be aligned with its format line
     */
    @SuppressWarnings("rawtypes")
    private static void writePredictions(AbstractCharLmRescoringChunker chunker,
                                         TagChunkCodec codec,
                                         File dataFile,
                                         File dataFormat,
                                         String outputPath) throws IOException {
        try (FileWriter writer = new FileWriter(outputPath);
             BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(dataFile)));
             BufferedReader formatReader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(dataFormat)))) {
            String sentence;
            while ((sentence = reader.readLine()) != null) {
                // Always consume the matching format line so both files stay in step.
                String formatSentence = formatReader.readLine();
                if (StringUtils.isEmpty(sentence)) {
                    writer.write("\n");
                    continue;
                }
                if (formatSentence == null) {
                    throw new IllegalStateException(
                            "Format file " + dataFormat + " has fewer lines than " + dataFile);
                }
                Chunking chunking = chunker.chunk(sentence);
                Tagging<String> tagged = codec.toTagging(chunking);
                String tokenLines = toTabSeparatedLines(tagged.toString());
                writer.write(alignToFormat(tokenLines, formatSentence) + "\n");
            }
        }
    }

    /**
     * Converts the textual form of a tagging (space-separated {@code token/TAG}
     * pairs) into one {@code token<TAB>TAG} line per token, with {@code B_x}/{@code I_x}
     * rewritten as {@code B-x}/{@code I-x}. A non-empty result always ends with a
     * newline so that the tag of the last token can be located like any other.
     */
    private static String toTabSeparatedLines(String tagging) {
        String lines = tagging.replace(' ', '\n');
        lines = TOKEN_TAG_SLASH.matcher(lines).replaceAll("$1\t$2");
        lines = TAG_UNDERSCORE.matcher(lines).replaceAll("\t$1-$2");
        if (!lines.isEmpty() && !lines.endsWith("\n")) {
            lines += "\n";
        }
        return lines;
    }

    /**
     * Re-groups tagger tokens so that each output line carries exactly one
     * whitespace-separated token of {@code formatSentence}. Consecutive tagger tokens
     * are concatenated until they spell the format token; the line gets the tag of
     * the first tagger token in the group.
     *
     * <p>NOTE(review): the previous version tried to clean up its output with three
     * {@code replaceAll} calls whose return values were discarded, so they never had
     * any effect. The two clean-ups that compensated for a doubled tab on the first
     * token and a missing tag on the last token are now unnecessary because both are
     * produced correctly here. The third one (re-tagging a parenthesised token after
     * an entity) additionally used a pattern that could not match; it is deliberately
     * not enabled so predictions are not altered by an unvalidated heuristic.
     *
     * @param tg newline-terminated {@code token<TAB>tag} lines
     * @param formatSentence the reference tokenization of the same sentence
     * @return the aligned {@code token<TAB>tag} lines, each ending in a newline
     * @throws IllegalStateException if the two tokenizations cannot be aligned
     */
    private static String alignToFormat(String tg, String formatSentence) {
        StringTokenizer st = new StringTokenizer(formatSentence);
        StringBuilder result = new StringBuilder();

        int firstTab = tg.indexOf("\t");
        if (firstTab < 0) {
            // The tagger produced no tokens; that is only fine if none were expected.
            if (st.hasMoreTokens()) {
                throw new IllegalStateException(
                        "No tagged tokens available for format line: " + formatSentence);
            }
            return "";
        }

        // Position the cursor just after the tab, exactly where get_next_word leaves it.
        MutableInt pos = new MutableInt(firstTab + 1);
        String curWord = tg.substring(0, firstTab);
        String curTag = get_next_tag(tg, pos.intValue());

        while (st.hasMoreTokens()) {
            String formatWord = st.nextToken();
            while (!formatWord.equals(curWord)) {
                // Only keep merging while we are still spelling a prefix of the target;
                // an empty piece means the tagger tokens are exhausted.
                String piece = formatWord.startsWith(curWord) ? get_next_word(tg, pos) : "";
                if (piece.isEmpty()) {
                    throw new IllegalStateException("Cannot align format token '" + formatWord
                            + "' with tagger tokens (reached '" + curWord + "')");
                }
                curWord += piece;
            }
            result.append(curWord).append("\t").append(curTag).append("\n");
            curWord = get_next_word(tg, pos);
            curTag = get_next_tag(tg, pos.intValue());
        }
        return result.toString();
    }

    /**
     * Runs {@code perl evalScript solution predictionPath}, storing the script's
     * standard output in {@code evalOutput} and forwarding its standard error to this
     * process. Redirecting at the OS level means the child can never block on a full
     * pipe and the report file is always completely written. Failures to start the
     * script are reported and do not abort the remaining models.
     */
    private static void runEvaluation(File evalScript, File solution,
                                      String predictionPath, File evalOutput) {
        ProcessBuilder builder = new ProcessBuilder(
                "perl", evalScript.toString(), solution.toString(), predictionPath);
        builder.redirectOutput(evalOutput);
        builder.redirectError(ProcessBuilder.Redirect.INHERIT);
        try {
            Process proc = builder.start();
            try {
                int exitCode = proc.waitFor();
                if (exitCode != 0) {
                    System.err.println("Evaluation of " + predictionPath
                            + " exited with status " + exitCode);
                }
            } catch (InterruptedException e) {
                // Restore the interrupt flag and stop the child instead of leaving it orphaned.
                Thread.currentThread().interrupt();
                proc.destroy();
                System.err.println("Interrupted while evaluating " + predictionPath);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the token on the line following the one that contains {@code pos} and
     * moves {@code pos} to the first character after that token's tab (the start of
     * its tag).
     *
     * @param source {@code token<TAB>tag} lines separated by newlines
     * @param pos cursor into {@code source}; updated only when a token is returned
     * @return the next token, or {@code ""} when there is no further line or the
     *         remaining text contains no tab
     */
    public static String get_next_word(String source, MutableInt pos) {
        int lineEnd = source.indexOf("\n", pos.intValue());
        if (lineEnd == -1) {
            // Without this guard the tab search below would restart at index 0
            // and hand back the first token of the sentence again.
            return "";
        }
        int tab = source.indexOf("\t", lineEnd + 1);
        if (tab == -1) {
            return "";
        }
        pos.setValue(tab + 1);
        return source.substring(lineEnd + 1, tab);
    }

    /**
     * Returns the text from {@code pos} up to (not including) the next newline that
     * occurs after {@code pos}.
     *
     * @param source {@code token<TAB>tag} lines separated by newlines
     * @param pos index of the first character to return
     * @return the text up to the next newline, or {@code ""} if no newline follows
     */
    public static String get_next_tag(String source, int pos) {
        int end = source.indexOf("\n", pos + 1);
        if (end == -1) {
            return "";
        }
        return source.substring(pos, end);
    }
} // class end