Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import nltk
- from Document import Document
- from Classifier import Classifier
- from nltk.corpus import reuters
- from nltk.stem.porter import PorterStemmer
- from nltk.corpus import stopwords
- from nltk.classify.scikitlearn import SklearnClassifier
- from sklearn.svm import SVC, LinearSVC
- import string
- import re
- import random
- import numpy as np
# The ten Reuters categories; one binary (one-vs-rest) classifier is trained per entry.
categories = ['earn', 'acq', 'money-fx', 'grain', 'crude', 'trade', 'interest', 'ship', 'wheat', 'corn']
# When True, train nltk.NaiveBayesClassifier instead of a scikit-learn LinearSVC.
useNaiveBayes = False
# Shared Porter stemmer instance used by preprocessFile.
porterStemmer = PorterStemmer()
# English stopwords plus ASCII punctuation; tokens in this set are discarded.
stopSet = set(stopwords.words('english'))
punctuations = set(string.punctuation)
# NOTE(review): name keeps the original "Puctuation" typo; other code refers to it by this name.
stopAndPuctuationSet = stopSet | punctuations
# Maps each category name to its position in `categories`, i.e. the index of
# its classifier in classifierPerCategory.  Derived from `categories` itself so
# the two can never drift out of sync (the original hand-written literal dict
# duplicated the list order and had to be maintained in parallel).
classifierIndex = {category: position for position, category in enumerate(categories)}
def checkNotANumber(word):
    """Return True iff `word` consists entirely of letters and is at least
    two characters long.  (Despite the name, digits are only one of the
    things this rejects: single letters and mixed tokens fail too.)"""
    # \Z anchors the match at the absolute end of the string, so this is a
    # full match -- equivalent to matching a prefix and comparing it to word.
    return re.match(r'[a-z][a-z]+\Z', word, re.I) is not None
def findCategory(clist, cats=None):
    """Return a binary indicator vector over a reference category list.

    Element i is 1 when cats[i] appears in `clist`, else 0.

    Args:
        clist: iterable of category names attached to a document.
        cats: optional reference category list; defaults to the module-level
            `categories`, so existing call sites are unchanged.

    Returns:
        list of 0/1 ints with one entry per reference category.
    """
    if cats is None:
        cats = categories
    return [1 if c in clist else 0 for c in cats]
def preprocessFile(file):
    """Tokenize one Reuters document, drop stopwords/punctuation/numeric
    tokens, stem what remains, and wrap the result in a Document.

    Args:
        file: a Reuters corpus fileid (e.g. 'training/9865').

    Returns:
        A Document carrying the fileid, its categories (restricted to the
        ten tracked ones), and the stemmed token list as content.
    """
    tokens = nltk.word_tokenize(' '.join(reuters.words(file)))
    # Restrict the document's labels to the categories we classify.
    category = [x for x in reuters.categories(file) if x in categories]
    document = Document(file, category)
    content = []
    for token in tokens:
        # Compare case-insensitively: the stopword list is all lowercase, so
        # the original case-sensitive test let capitalized stopwords ("The",
        # "And") through even though kept tokens are lowercased anyway.
        if token.lower() not in stopAndPuctuationSet and checkNotANumber(token):
            stemmedWord = porterStemmer.stem(token.lower())
            content.append(stemmedWord)
    document.setContent(content)
    return document
def makeDictionary(array):
    """Map every token in `array` to the string "True", producing the
    bag-of-words feature dict fed to the NLTK classifiers."""
    return dict.fromkeys(array, "True")
# All fileids that carry at least one of the ten tracked categories.
files = reuters.fileids(categories)
# random.shuffle(files)
# files = files[:1500]
# ACQ 2369 != 1829
# TRADE 488 != 485
# CORN 237 != 238
# print(len(reuters.fileids('acq')))
trainingSet = []  # Document objects whose fileid starts with 'training'
testSet = []  # Document objects whose fileid starts with 'test'
classifierPerCategory = []  # one Classifier wrapper per tracked category
# it goes from 0 to 9...
# Train one binary (one-vs-rest) classifier per category.
for i in range(0,10):
    category = categories[i]
    for file in files:
        preprocessedFile = preprocessFile(file)
        categoriesFile = [x for x in reuters.categories(file) if x in categories]
        # Binary label: 1 when the document belongs to the current category.
        if (category in categoriesFile):
            preprocessedFile.setCategory(1)
        else:
            preprocessedFile.setCategory(0)
        # Reuters fileids encode the split: 'training/...' vs 'test/...'.
        index = file.find('training')
        if (index != -1):
            trainingSet.append(preprocessedFile)
        else:
            index = file.find('test')
            if (index != -1):
                testSet.append(preprocessedFile)
    # NOTE(review): trainingSet/testSet are never cleared between iterations,
    # so every category pass re-appends all documents (with that category's
    # labels) onto the accumulated lists; presumably they should be reset at
    # the top of the outer loop -- confirm intended behavior.
    training = [[makeDictionary(x.getContent()), x.getCategory()] for x in trainingSet]
    testing = [[makeDictionary(x.getContent()), x.getCategory()] for x in testSet]
    if useNaiveBayes:
        trainedClassifier = nltk.NaiveBayesClassifier.train(training)
    else:
        # trainedClassifier = SklearnClassifier(SVC()).train(training)
        trainedClassifier = SklearnClassifier(LinearSVC()).train(training)
    classifier = Classifier(training, testing, category, trainedClassifier)
    classifierPerCategory.append(classifier)
# Quick diagnostics on the assembled splits and classifiers.
print('----Training Set----')
print(len(trainingSet))
# print(trainingSet)
print('\n')
print('----Test Set----')
print(len(testSet))
# print(testSet)
# One classifier per tracked category -- expected to print 10.
print(len(classifierPerCategory))
#print(classifierPerCategory[0]);
print(type(list(reuters.categories(testSet[0].getFileID()))))
# Per-category evaluation metrics, appended in `categories` order.
# (The original also zeroed tp/fp/tn/fn here, but those counters are
# re-initialized at the top of the evaluation loop, so the extra inits
# were dead code and have been dropped.)
precision = []
recall = []
f1 = []
accuracy = []
# Evaluate each per-category classifier on the test set: build a confusion
# matrix, then derive precision / recall / F1 / accuracy for the category.
# (Removed the original's `found = False`, which was never read.)
for category in categories:
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    index = int(classifierIndex.get(category))
    classifier = classifierPerCategory[index].getTrainedClassifier()
    for file in testSet:
        fileCategories = [x for x in reuters.categories(file.getFileID()) if x in categories]
        test_file = makeDictionary(file.getContent())
        classification = classifier.classify(test_file)
        # Tally the one-vs-rest confusion matrix.
        if (classification == 1 and category in fileCategories):
            tp = tp + 1
        elif (classification == 1 and category not in fileCategories):
            fp = fp + 1
        elif (classification == 0 and category in fileCategories):
            fn = fn + 1
        elif (classification == 0 and category not in fileCategories):
            tn = tn + 1
    # Guard the divisions: a category may get zero positive predictions
    # (precision undefined) or have zero positive instances (recall undefined).
    if tp + fp == 0:
        pr = 0
    else:
        pr = float(float(tp)/(float(tp)+float(fp)))
    if tp + fn == 0:
        rec = 0
    else:
        rec = float(float(tp)/(float(tp)+float(fn)))
    if pr + rec == 0:
        f = 0
    else:
        f = float((2 * pr * rec)/(pr + rec))
    ac = float((float(tp)+ float(tn))/(float(tp)+float(tn)+float(fp)+float(fn)))
    precision.append(pr)
    recall.append(rec)
    f1.append(f)
    accuracy.append(ac)
# Per-category metric vectors, in `categories` order.
print(accuracy)
print(precision)
print(recall)
print(f1)
# Macro-averages across the ten categories.  The original used Python 2
# print statements here ("print np.mean(...)"), inconsistent with the
# print() calls used everywhere else and a SyntaxError under Python 3;
# normalized to call form, which is valid in both versions.
print(np.mean(accuracy))
print(np.mean(precision))
print(np.mean(recall))
print(np.mean(f1))
- # print("TP : " +str(tp))
- # print("TN : " +str(tn))
- # print("FP : " +str(fp))
- # print("FN : " +str(fn))
- #
- # print(float(certo)/float(len(testSet)))
- #
- #
- # precision = float(float(tp)/(float(tp)+float(fp)))
- #
- # recall = float(float(tp)/(float(tp)+float(fn)))
- #
- # f1 = float((2 * precision * recall)/(precision + recall))
- #
- # print("PRECISION : " + str(precision))
- # print("RECALL : " + str(recall))
- # print("F1 : " + str(f1))
- #
- # accuracy = float((float(tp)+ float(tn))/(float(tp)+float(tn)+float(fp)+float(fn)))
- # print("Accuracy : " + str(accuracy))
- #print("Classificador Classe " + str(i) + " , " + categories[i])
- #classifier = nltk.NaiveBayesClassifier.train(classifierPerCategory[i].getTrainingSet())
- #print(nltk.NaiveBayesClassifier.classify(classifier,classifierPerCategory[i].getTestSet()))
- #print nltk.classify.accuracy(classifier, classifierPerCategory[i].getTestSet())
- #training = [[makeDictionary(x.getContent()), x.getCategory()] for x in trainingSet]
- #testing = [[makeDictionary(x.getContent()), x.getCategory()] for x in testSet]
- ## naive bayes
- #classifier = nltk.NaiveBayesClassifier.train(training)
- #print nltk.classify.accuracy(classifier, testing)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement