Advertisement
Guest User

buggy-naive-classification-with-nltk

a guest
Jan 4th, 2017
207
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.92 KB | None | 0 0
  1. import nltk.classify.util,os,sys
  2. from nltk.classify import NaiveBayesClassifier
  3. from nltk.corpus import stopwords
  4. from nltk.tokenize  import word_tokenize,RegexpTokenizer
  5. import re
  6.  
  7. TAG_RE = re.compile(r'<[^>]+>')
  8. def remove_tags(text):
  9.     return TAG_RE.sub('', text)
  10.  
  11. def word_feats(words):
  12.     return dict([(word,True) for word in words])
  13.  
  14. def feature_extractor(sentiment):
  15.     path = "train/"+sentiment+"/"
  16.     files = os.listdir(path)
  17.     feats = {}
  18.     i = 0
  19.     for file in files:
  20.         f = open(path+file,"r", encoding='utf-8')
  21.         review = f.read()
  22.         review = remove_tags(review)
  23.         stopWords = (stopwords.words("english"))
  24.         tokenizer = RegexpTokenizer(r"\w+")
  25.         tokens = tokenizer.tokenize(review)    
  26.         features = word_feats(tokens)
  27.         feats.update(features)
  28.     return feats
  29.  
  30. posative_feat = feature_extractor("pos")
  31. p = open("posFeat.txt","w", encoding='utf-8')
  32. p.write(str(posative_feat))
  33. negative_feat = feature_extractor("neg")
  34. n = open("negFeat.txt","w", encoding='utf-8')
  35. n.write(str(negative_feat))
  36. plength = int(len(posative_feat)*3/4)
  37. nlength = int(len(negative_feat)*3/4)
  38. totalLength = plength+nlength
  39. trainFeatList = []
  40. testFeatList  = []
  41. i = 0
  42. for items in posative_feat.items():
  43.     i +=1
  44.     feature_name = 'word'
  45.     feature = items[0]
  46.     label = 'pos'
  47.     value = ({feature_name:feature}, label)
  48.     if(i<plength):
  49.         trainFeatList.append(value)
  50.     else:  
  51.         testFeatList.append(value)
  52.  
  53. j = 0
  54. for items in negative_feat.items():
  55.     j +=1;
  56.     feature_name = 'word'
  57.     feature = items[0]
  58.     label = 'neg'
  59.     value = ({feature_name:feature}, label)
  60.     if(j<plength):
  61.         trainFeatList.append(value)
  62.     else:
  63.         testFeatList.append(value)
  64.        
  65. classifier = NaiveBayesClassifier.train(trainFeatList)
  66. print(nltk.classify.util.accuracy(classifier,testFeatList))
  67. classifier.show_most_informative_features()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement