Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.nio.MappedByteBuffer;
- import java.nio.channels.FileChannel;
- import java.nio.charset.Charset;
- import java.util.ArrayList;
- import java.util.List;
- import weka.classifiers.functions.LibSVM;
- import weka.classifiers.functions.SMO;
- import weka.classifiers.functions.supportVector.PolyKernel;
- import weka.core.Attribute;
- import weka.core.Instance;
- import weka.core.Instances;
- import weka.core.SerializationHelper;
- import weka.core.converters.TextDirectoryLoader;
- import weka.core.tokenizers.NGramTokenizer;
- import weka.filters.Filter;
- import weka.filters.unsupervised.attribute.StringToWordVector;
- public class Finalizer {
- private static SMO svm;
- private static Instances data;
- private static StringToWordVector vectoriser;
- public Finalizer() throws Exception {
- // this.svm = (LibSVM) SerializationHelper.read("SVM_NG_2_Movie.model");
- this.svm = (SMO) SerializationHelper.read("SVM_NG_2_1_Movie.model");
- this.data = (Instances) SerializationHelper.read("InstSMO.bin");
- this.vectoriser = vectorizer();
- }
- public static void buildSVMClassifier(String strDirPath) throws Exception {
- NGramTokenizer ngt = new NGramTokenizer();
- ngt.setNGramMaxSize(2);
- TextDirectoryLoader tdl = new TextDirectoryLoader();
- tdl.setDirectory(new File(strDirPath));
- Instances instances = tdl.getDataSet();
- StringToWordVector stwv = new StringToWordVector();
- stwv.setTokenizer(ngt);
- stwv.setTFTransform(true);
- stwv.setIDFTransform(true);
- stwv.setUseStoplist(true);
- stwv.setLowerCaseTokens(true);
- stwv.setInputFormat(instances);
- Instances filterdInstances = Filter.useFilter(instances, stwv);
- /*
- * LibSVM svm = new LibSVM(); SelectedTag kt = new SelectedTag(0,
- * LibSVM.TAGS_KERNELTYPE); SelectedTag svmt = new SelectedTag(0,
- * LibSVM.TAGS_SVMTYPE); svm.setKernelType(kt); svm.setSVMType(svmt);
- * svm.setProbabilityEstimates(true);
- * svm.buildClassifier(filterdInstances);
- */
- SMO smosvm = new SMO();
- // RBFKernel kernal = new RBFKernel();
- PolyKernel kernal = new PolyKernel();
- smosvm.setKernel(kernal);
- smosvm.buildClassifier(filterdInstances);
- SerializationHelper sh = new SerializationHelper();
- sh.write("SVM_NG_2_1_Movie.model", svm);
- sh.write("InstSMO.bin", instances);
- }
- public static StringToWordVector vectorizer() throws Exception {
- NGramTokenizer ngt = new NGramTokenizer();
- ngt.setNGramMaxSize(2);
- StringToWordVector stwv = new StringToWordVector();
- stwv.setTokenizer(ngt);
- stwv.setTFTransform(true);
- stwv.setIDFTransform(true);
- stwv.setUseStoplist(true);
- stwv.setLowerCaseTokens(true);
- // stwv.setInputFormat(data);
- return stwv;
- }
- public static Instance makeInstance(String text, Instances data) {
- int cIdx = data.numAttributes() - 1;
- data.setClassIndex(cIdx);
- Instance instance = new Instance(2);
- Attribute messageAtt = data.attribute("text");
- instance.setValue(messageAtt, messageAtt.addStringValue(text));
- instance.setDataset(data);
- instance.setClassMissing();
- return instance;
- }
- public static String classify(String msg) throws Exception {
- // Filter.useFilter(data, vectoriser);
- Instance inst = makeInstance(msg, data);
- vectoriser.setInputFormat(data);
- Filter.useFilter(data, vectoriser);
- // data.
- vectoriser.input(inst);
- // vectoriser.setInputFormat(data);
- Instance filteredInstance = vectoriser.output();
- filteredInstance.setClassMissing();
- //double predicted = 0;
- String label = "uk";
- try {
- double predicted = svm.classifyInstance(filteredInstance);
- label = data.classAttribute().value((int) predicted);
- } catch (Exception e) {
- e.printStackTrace();
- }
- // System.out.println(predicted);
- //String label = data.classAttribute().value((int) predicted);
- return label;
- }
- // /////////For testing only
- public static List<String> listFiles(String pathF) {
- String path = pathF;
- String files;
- File folder = new File(path);
- File[] listOfFiles = folder.listFiles();
- List<String> fileList = new ArrayList<String>();
- for (int i = 0; i < listOfFiles.length; i++) {
- if (listOfFiles[i].isFile()) {
- files = listOfFiles[i].getAbsolutePath().toString();
- if (files.endsWith(".txt")) {
- // System.out.println(files);
- fileList.add(files);
- }
- }
- }
- return fileList;
- }
- // ///////////////////
- // //
- private static String readFile(String path) throws IOException {
- FileInputStream stream = new FileInputStream(new File(path));
- try {
- FileChannel fc = stream.getChannel();
- MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY, 0,
- fc.size());
- return Charset.defaultCharset().decode(bb).toString();
- } finally {
- stream.close();
- }
- }
- public static String extractFromTextFile(String FilePath)
- throws IOException {
- return readFile(FilePath);
- }
- // /////////////////////////////
- public static void main(String[] args) throws Exception {
- // String dp = "/usr/share/nltk_data/corpora/movie_reviews";
- // buildSVMClassifier(dp);
- String dir = "/home/u179995/nltk_data/pos";
- List<String> files = listFiles(dir);
- Finalizer classifier = new Finalizer();
- int file = 0;
- int type = 0;
- int typeNeg = 0;
- for (String string : files) {
- file += 1;
- System.out.println(string);
- String content = extractFromTextFile(string);
- String label = classifier.classify(content);
- System.out.println("L : " + label + " :POS");
- if (label.equalsIgnoreCase("pos")) {
- type += 1;
- } else {
- typeNeg += 1;
- }
- }
- System.out.println("NEG :" + type);
- System.out.println("NEG AS POS :" + typeNeg);
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement