Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import nltk.classify.util,os,sys
- from nltk.classify import NaiveBayesClassifier
- from nltk.corpus import stopwords
- from nltk.tokenize import word_tokenize,RegexpTokenizer
- import re
- TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return *text* with every substring matched by ``TAG_RE`` deleted."""
    stripped = TAG_RE.sub('', text)
    return stripped
def word_feats(words):
    """Build a presence-feature dict from an iterable of tokens.

    Args:
        words: Iterable of hashable tokens; duplicates are allowed.

    Returns:
        A dict mapping each distinct token to ``True``, in first-seen order.
    """
    # dict.fromkeys builds the mapping in one pass, with no intermediate
    # list of (token, True) tuples.
    return dict.fromkeys(words, True)
def feature_extractor(sentiment):
    """Collect the vocabulary of every file under ``train/<sentiment>/``.

    Each file is read as UTF-8, passed through ``remove_tags``, split into
    runs of word characters, and merged into one dict via ``word_feats``.
    Stop words are not filtered out.

    Args:
        sentiment: Name of the sub-directory of ``train/`` to scan (the
            script calls this with ``"pos"`` and ``"neg"``).

    Returns:
        A dict mapping every distinct token found in any file to ``True``.

    Raises:
        OSError: If the directory cannot be listed or an entry cannot be
            opened for reading.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    path = "train/" + sentiment + "/"
    # The tokenizer does not depend on the file, so build it once.
    tokenizer = RegexpTokenizer(r"\w+")
    feats = {}
    for name in os.listdir(path):
        # The context manager closes each handle as soon as it is read,
        # so a large corpus cannot exhaust file descriptors.
        with open(path + name, "r", encoding='utf-8') as handle:
            review = handle.read()
        tokens = tokenizer.tokenize(remove_tags(review))
        feats.update(word_feats(tokens))
    return feats
def _split_labelled(feats, label, cutoff):
    """Split a vocabulary dict into (train, test) lists of labelled samples.

    Args:
        feats: Dict whose keys are the tokens to label.
        label: Class label attached to every sample.
        cutoff: Number of leading tokens (in dict order) sent to training.

    Returns:
        A ``(train, test)`` pair of lists; every element has the form
        ``({'word': token}, label)``.
    """
    train, test = [], []
    for index, word in enumerate(feats):
        sample = ({'word': word}, label)
        # Zero-based index: exactly `cutoff` samples land in the train list.
        (train if index < cutoff else test).append(sample)
    return train, test


# Extract both vocabularies and dump them to text files for inspection.
# `with` guarantees the dumps are flushed and the handles closed.
posative_feat = feature_extractor("pos")
with open("posFeat.txt", "w", encoding='utf-8') as p:
    p.write(str(posative_feat))

negative_feat = feature_extractor("neg")
with open("negFeat.txt", "w", encoding='utf-8') as n:
    n.write(str(negative_feat))

# Three quarters of each vocabulary is used for training, the rest held out.
plength = int(len(posative_feat) * 3 / 4)
nlength = int(len(negative_feat) * 3 / 4)
totalLength = plength + nlength

pos_train, pos_test = _split_labelled(posative_feat, 'pos', plength)
# Each class is cut at its own length, so the negative split uses nlength.
neg_train, neg_test = _split_labelled(negative_feat, 'neg', nlength)

# Positive samples first, then negative.
trainFeatList = pos_train + neg_train
testFeatList = pos_test + neg_test

# NOTE(review): every sample is a single-word featureset, so the classifier
# is trained on vocabulary entries, not whole reviews - confirm this is
# the intended setup.
classifier = NaiveBayesClassifier.train(trainFeatList)
print(nltk.classify.util.accuracy(classifier, testFeatList))
classifier.show_most_informative_features()
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement