Advertisement
Guest User

Untitled

a guest
Sep 29th, 2016
56
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.38 KB | None | 0 0
  1. import nltk
  2. from Document import Document
  3. from Classifier import Classifier
  4. from nltk.corpus import reuters
  5. from nltk.stem.porter import PorterStemmer
  6. from nltk.corpus import stopwords
  7. from nltk.classify.scikitlearn import SklearnClassifier
  8. from sklearn.svm import SVC, LinearSVC
  9.  
  10. import string
  11. import re
  12. import random
  13. import numpy as np
  14.  
  15. categories = ['earn', 'acq', 'money-fx', 'grain', 'crude', 'trade', 'interest', 'ship', 'wheat', 'corn']
  16. useNaiveBayes = False
  17.  
  18. porterStemmer = PorterStemmer()
  19. stopSet = set(stopwords.words('english'))
  20. punctuations = set(string.punctuation)
  21. stopAndPuctuationSet = stopSet | punctuations
  22.  
  23. classifierIndex = {
  24. 'earn':0,
  25. 'acq':1,
  26. 'money-fx':2,
  27. 'grain':3,
  28. 'crude':4,
  29. 'trade':5,
  30. 'interest':6,
  31. 'ship':7,
  32. 'wheat':8,
  33. 'corn':9
  34. }
  35.  
  36. # print(stopAndPuctuationSet)
  37.  
  38. def checkNotANumber(word):
  39. obj = re.match(r'[a-z][a-z]+', word, re.I)
  40. if obj:
  41. if obj.group() == word:
  42. return True
  43. return False
  44.  
  45.  
  46. def findCategory(clist):
  47. c = [0] * len(categories)
  48. for i in range(0, len(c)):
  49. if categories[i] in clist:
  50. c[i] += 1
  51. return c
  52.  
  53.  
  54. def preprocessFile(file):
  55. tokens = nltk.word_tokenize(' '.join(reuters.words(file)))
  56.  
  57. # # we should be using this one
  58. #category = findCategory(reuters.categories(file))
  59.  
  60.  
  61. # # but for now we can test with this
  62. category = [x for x in reuters.categories(file) if x in categories]
  63.  
  64.  
  65.  
  66. document = Document(file, category)
  67. content = []
  68. for token in tokens:
  69. if token not in stopAndPuctuationSet and checkNotANumber(token):
  70. stemmedWord = porterStemmer.stem(token.lower())
  71. content.append(stemmedWord)
  72.  
  73. document.setContent(content)
  74. return document
  75.  
  76.  
  77. def makeDictionary(array):
  78. return dict([[x, "True"] for x in array])
  79.  
  80.  
  81. files = reuters.fileids(categories)
  82. # random.shuffle(files)
  83. # files = files[:1500]
  84.  
  85. # ACQ 2369 != 1829
  86. # TRADE 488 != 485
  87. # CORN 237 != 238
  88.  
  89. # print(len(reuters.fileids('acq')))
  90.  
  91. trainingSet = []
  92. testSet = []
  93. classifierPerCategory = []
  94.  
  95.  
  96. # it goes from 0 to 9...
  97. for i in range(0,10):
  98. category = categories[i]
  99.  
  100. for file in files:
  101. preprocessedFile = preprocessFile(file)
  102.  
  103. categoriesFile = [x for x in reuters.categories(file) if x in categories]
  104.  
  105. if (category in categoriesFile):
  106. preprocessedFile.setCategory(1)
  107. else:
  108. preprocessedFile.setCategory(0)
  109.  
  110.  
  111. index = file.find('training')
  112. if (index != -1):
  113. trainingSet.append(preprocessedFile)
  114. else:
  115. index = file.find('test')
  116. if (index != -1):
  117. testSet.append(preprocessedFile)
  118.  
  119. training = [[makeDictionary(x.getContent()), x.getCategory()] for x in trainingSet]
  120. testing = [[makeDictionary(x.getContent()), x.getCategory()] for x in testSet]
  121.  
  122. if useNaiveBayes:
  123. trainedClassifier = nltk.NaiveBayesClassifier.train(training)
  124. else:
  125. # trainedClassifier = SklearnClassifier(SVC()).train(training)
  126. trainedClassifier = SklearnClassifier(LinearSVC()).train(training)
  127.  
  128. classifier = Classifier(training, testing, category, trainedClassifier)
  129. classifierPerCategory.append(classifier)
  130.  
  131. print('----Training Set----')
  132. print(len(trainingSet))
  133. # print(trainingSet)
  134.  
  135. print('\n')
  136. print('----Test Set----')
  137. print(len(testSet))
  138. # print(testSet)
  139.  
  140.  
  141. print(len(classifierPerCategory))
  142.  
  143. #print(classifierPerCategory[0]);
  144.  
  145.  
  146. print(type(list(reuters.categories(testSet[0].getFileID()))))
  147.  
  148.  
  149. tp = 0
  150. fp = 0
  151. tn = 0
  152. fn = 0
  153.  
  154. precision = []
  155. recall = []
  156. f1 = []
  157. accuracy = []
  158.  
  159. for category in categories:
  160. tp = 0
  161. fp = 0
  162. tn = 0
  163. fn = 0
  164. index = int(classifierIndex.get(category))
  165. classifier = classifierPerCategory[index].getTrainedClassifier()
  166.  
  167. for file in testSet:
  168. fileCategories = [x for x in reuters.categories(file.getFileID()) if x in categories]
  169. found = False
  170.  
  171. test_file = makeDictionary(file.getContent())
  172. classification = classifier.classify(test_file)
  173.  
  174. if (classification == 1 and category in fileCategories):
  175. tp = tp + 1
  176. elif (classification == 1 and category not in fileCategories):
  177. fp = fp + 1
  178. elif (classification == 0 and category in fileCategories):
  179. fn = fn + 1
  180. elif (classification == 0 and category not in fileCategories):
  181. tn = tn + 1
  182.  
  183. if tp + fp == 0:
  184. pr = 0
  185. else:
  186. pr = float(float(tp)/(float(tp)+float(fp)))
  187.  
  188. if tp + fn == 0:
  189. rec = 0
  190. else:
  191. rec = float(float(tp)/(float(tp)+float(fn)))
  192.  
  193. if pr + rec == 0:
  194. f = 0
  195. else:
  196. f = float((2 * pr * rec)/(pr + rec))
  197.  
  198. ac = float((float(tp)+ float(tn))/(float(tp)+float(tn)+float(fp)+float(fn)))
  199. precision.append(pr)
  200. recall.append(rec)
  201. f1.append(f)
  202. accuracy.append(ac)
  203.  
  204. print(accuracy)
  205. print(precision)
  206. print(recall)
  207. print(f1)
  208.  
  209. print np.mean(accuracy)
  210. print np.mean(precision)
  211. print np.mean(recall)
  212. print np.mean(f1)
  213.  
  214.  
  215.  
  216. # print("TP : " +str(tp))
  217. # print("TN : " +str(tn))
  218. # print("FP : " +str(fp))
  219. # print("FN : " +str(fn))
  220. #
  221. # print(float(certo)/float(len(testSet)))
  222. #
  223. #
  224. # precision = float(float(tp)/(float(tp)+float(fp)))
  225. #
  226. # recall = float(float(tp)/(float(tp)+float(fn)))
  227. #
  228. # f1 = float((2 * precision * recall)/(precision + recall))
  229. #
  230. # print("PRECISION : " + str(precision))
  231. # print("RECALL : " + str(recall))
  232. # print("F1 : " + str(f1))
  233. #
  234. # accuracy = float((float(tp)+ float(tn))/(float(tp)+float(tn)+float(fp)+float(fn)))
  235. # print("Accuracy : " + str(accuracy))
  236.  
  237. #print("Classificador Classe " + str(i) + " , " + categories[i])
  238. #classifier = nltk.NaiveBayesClassifier.train(classifierPerCategory[i].getTrainingSet())
  239. #print(nltk.NaiveBayesClassifier.classify(classifier,classifierPerCategory[i].getTestSet()))
  240. #print nltk.classify.accuracy(classifier, classifierPerCategory[i].getTestSet())
  241.  
  242.  
  243.  
  244. #training = [[makeDictionary(x.getContent()), x.getCategory()] for x in trainingSet]
  245. #testing = [[makeDictionary(x.getContent()), x.getCategory()] for x in testSet]
  246.  
  247. ## naive bayes
  248. #classifier = nltk.NaiveBayesClassifier.train(training)
  249. #print nltk.classify.accuracy(classifier, testing)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement