Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import nltk
- from Document import Document
- from Classifier import Classifier
- from nltk.corpus import reuters
- from nltk.stem.porter import PorterStemmer
- from nltk.corpus import stopwords
- from nltk.classify.scikitlearn import SklearnClassifier
- from sklearn.svm import SVC, LinearSVC
- import string
- import re
- import random
- import numpy as np
# The ten Reuters categories; one binary (one-vs-rest) classifier is trained per entry.
categories = ['earn', 'acq', 'money-fx', 'grain', 'crude', 'trade', 'interest', 'ship', 'wheat', 'corn']
# When True, train nltk.NaiveBayesClassifier instead of a scikit-learn LinearSVC.
useNaiveBayes = False
# Shared Porter stemmer instance used by preprocessFile.
porterStemmer = PorterStemmer()
# English stopwords plus ASCII punctuation; tokens in this set are discarded.
stopSet = set(stopwords.words('english'))
punctuations = set(string.punctuation)
# NOTE(review): name keeps the original "Puctuation" typo; other code refers to it by this name.
stopAndPuctuationSet = stopSet | punctuations
# Maps each category name to its position in `categories`, i.e. the index of
# its classifier in classifierPerCategory.  Derived from `categories` itself so
# the two can never drift out of sync (the original hand-written literal dict
# duplicated the list order and had to be maintained in parallel).
classifierIndex = {category: position for position, category in enumerate(categories)}
def checkNotANumber(word):
    """Return True iff `word` consists entirely of letters and is at least
    two characters long.  (Despite the name, digits are only one of the
    things this rejects: single letters and mixed tokens fail too.)"""
    # \Z anchors the match at the absolute end of the string, so this is a
    # full match -- equivalent to matching a prefix and comparing it to word.
    return re.match(r'[a-z][a-z]+\Z', word, re.I) is not None
def findCategory(clist, cats=None):
    """Return a binary indicator vector over a reference category list.

    Element i is 1 when cats[i] appears in `clist`, else 0.

    Args:
        clist: iterable of category names attached to a document.
        cats: optional reference category list; defaults to the module-level
            `categories`, so existing call sites are unchanged.

    Returns:
        list of 0/1 ints with one entry per reference category.
    """
    if cats is None:
        cats = categories
    return [1 if c in clist else 0 for c in cats]
def preprocessFile(file):
    """Tokenize one Reuters document, drop stopwords/punctuation/numeric
    tokens, stem what remains, and wrap the result in a Document.

    Args:
        file: a Reuters corpus fileid (e.g. 'training/9865').

    Returns:
        A Document carrying the fileid, its categories (restricted to the
        ten tracked ones), and the stemmed token list as content.
    """
    tokens = nltk.word_tokenize(' '.join(reuters.words(file)))
    # Restrict the document's labels to the categories we classify.
    category = [x for x in reuters.categories(file) if x in categories]
    document = Document(file, category)
    content = []
    for token in tokens:
        # Compare case-insensitively: the stopword list is all lowercase, so
        # the original case-sensitive test let capitalized stopwords ("The",
        # "And") through even though kept tokens are lowercased anyway.
        if token.lower() not in stopAndPuctuationSet and checkNotANumber(token):
            stemmedWord = porterStemmer.stem(token.lower())
            content.append(stemmedWord)
    document.setContent(content)
    return document
def makeDictionary(array):
    """Map every token in `array` to the string "True", producing the
    bag-of-words feature dict fed to the NLTK classifiers."""
    return dict.fromkeys(array, "True")
# All fileids that carry at least one of the ten tracked categories.
files = reuters.fileids(categories)
# random.shuffle(files)
# files = files[:1500]
# ACQ 2369 != 1829
# TRADE 488 != 485
# CORN 237 != 238
# print(len(reuters.fileids('acq')))
trainingSet = []  # Document objects whose fileid starts with 'training'
testSet = []  # Document objects whose fileid starts with 'test'
classifierPerCategory = []  # one Classifier wrapper per tracked category
# it goes from 0 to 9...
# Train one binary (one-vs-rest) classifier per category.
for i in range(0,10):
    category = categories[i]
    for file in files:
        preprocessedFile = preprocessFile(file)
        categoriesFile = [x for x in reuters.categories(file) if x in categories]
        # Binary label: 1 when the document belongs to the current category.
        if (category in categoriesFile):
            preprocessedFile.setCategory(1)
        else:
            preprocessedFile.setCategory(0)
        # Reuters fileids encode the split: 'training/...' vs 'test/...'.
        index = file.find('training')
        if (index != -1):
            trainingSet.append(preprocessedFile)
        else:
            index = file.find('test')
            if (index != -1):
                testSet.append(preprocessedFile)
    # NOTE(review): trainingSet/testSet are never cleared between iterations,
    # so every category pass re-appends all documents (with that category's
    # labels) onto the accumulated lists; presumably they should be reset at
    # the top of the outer loop -- confirm intended behavior.
    training = [[makeDictionary(x.getContent()), x.getCategory()] for x in trainingSet]
    testing = [[makeDictionary(x.getContent()), x.getCategory()] for x in testSet]
    if useNaiveBayes:
        trainedClassifier = nltk.NaiveBayesClassifier.train(training)
    else:
        # trainedClassifier = SklearnClassifier(SVC()).train(training)
        trainedClassifier = SklearnClassifier(LinearSVC()).train(training)
    classifier = Classifier(training, testing, category, trainedClassifier)
    classifierPerCategory.append(classifier)
# Quick diagnostics on the assembled splits and classifiers.
print('----Training Set----')
print(len(trainingSet))
# print(trainingSet)
print('\n')
print('----Test Set----')
print(len(testSet))
# print(testSet)
# One classifier per tracked category -- expected to print 10.
print(len(classifierPerCategory))
#print(classifierPerCategory[0]);
print(type(list(reuters.categories(testSet[0].getFileID()))))
# Per-category evaluation metrics, appended in `categories` order.
# (The original also zeroed tp/fp/tn/fn here, but those counters are
# re-initialized at the top of the evaluation loop, so the extra inits
# were dead code and have been dropped.)
precision = []
recall = []
f1 = []
accuracy = []
# Evaluate each per-category classifier on the test set: build a confusion
# matrix, then derive precision / recall / F1 / accuracy for the category.
# (Removed the original's `found = False`, which was never read.)
for category in categories:
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    index = int(classifierIndex.get(category))
    classifier = classifierPerCategory[index].getTrainedClassifier()
    for file in testSet:
        fileCategories = [x for x in reuters.categories(file.getFileID()) if x in categories]
        test_file = makeDictionary(file.getContent())
        classification = classifier.classify(test_file)
        # Tally the one-vs-rest confusion matrix.
        if (classification == 1 and category in fileCategories):
            tp = tp + 1
        elif (classification == 1 and category not in fileCategories):
            fp = fp + 1
        elif (classification == 0 and category in fileCategories):
            fn = fn + 1
        elif (classification == 0 and category not in fileCategories):
            tn = tn + 1
    # Guard the divisions: a category may get zero positive predictions
    # (precision undefined) or have zero positive instances (recall undefined).
    if tp + fp == 0:
        pr = 0
    else:
        pr = float(float(tp)/(float(tp)+float(fp)))
    if tp + fn == 0:
        rec = 0
    else:
        rec = float(float(tp)/(float(tp)+float(fn)))
    if pr + rec == 0:
        f = 0
    else:
        f = float((2 * pr * rec)/(pr + rec))
    ac = float((float(tp)+ float(tn))/(float(tp)+float(tn)+float(fp)+float(fn)))
    precision.append(pr)
    recall.append(rec)
    f1.append(f)
    accuracy.append(ac)
# Per-category metric vectors, in `categories` order.
print(accuracy)
print(precision)
print(recall)
print(f1)
# Macro-averages across the ten categories.  The original used Python 2
# print statements here ("print np.mean(...)"), inconsistent with the
# print() calls used everywhere else and a SyntaxError under Python 3;
# normalized to call form, which is valid in both versions.
print(np.mean(accuracy))
print(np.mean(precision))
print(np.mean(recall))
print(np.mean(f1))
- # print("TP : " +str(tp))
- # print("TN : " +str(tn))
- # print("FP : " +str(fp))
- # print("FN : " +str(fn))
- #
- # print(float(certo)/float(len(testSet)))
- #
- #
- # precision = float(float(tp)/(float(tp)+float(fp)))
- #
- # recall = float(float(tp)/(float(tp)+float(fn)))
- #
- # f1 = float((2 * precision * recall)/(precision + recall))
- #
- # print("PRECISION : " + str(precision))
- # print("RECALL : " + str(recall))
- # print("F1 : " + str(f1))
- #
- # accuracy = float((float(tp)+ float(tn))/(float(tp)+float(tn)+float(fp)+float(fn)))
- # print("Accuracy : " + str(accuracy))
- #print("Classificador Classe " + str(i) + " , " + categories[i])
- #classifier = nltk.NaiveBayesClassifier.train(classifierPerCategory[i].getTrainingSet())
- #print(nltk.NaiveBayesClassifier.classify(classifier,classifierPerCategory[i].getTestSet()))
- #print nltk.classify.accuracy(classifier, classifierPerCategory[i].getTestSet())
- #training = [[makeDictionary(x.getContent()), x.getCategory()] for x in trainingSet]
- #testing = [[makeDictionary(x.getContent()), x.getCategory()] for x in testSet]
- ## naive bayes
- #classifier = nltk.NaiveBayesClassifier.train(training)
- #print nltk.classify.accuracy(classifier, testing)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement