Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
import pickle

from nltk.chunk import ne_chunk  # named-entity chunking
from nltk.classify import NaiveBayesClassifier as nbc, accuracy  # training and testing
from nltk.corpus import movie_reviews, stopwords as sw, wordnet  # corpora / filtering
from nltk.probability import FreqDist  # word frequencies
from nltk.stem import PorterStemmer as ps, WordNetLemmatizer as wnl  # stemming / lemmatizing
from nltk.tag import pos_tag  # detail tagging
from nltk.tokenize import word_tokenize as wtok, sent_tokenize as stok  # tokenizing
# --- Collect word lists from the NLTK movie_reviews corpus -----------------
# Builds one stopword-filtered token list per sentiment category.
negative_review = []
positive_review = []
y = 0  # file counter used to cut the corpus scan short after ~11 files
s_words = sw.words('english')
_stop = set(s_words)  # set gives O(1) membership tests (list lookup is O(n) per token)

# Get data from nltk.corpus, movie_reviews
for x in movie_reviews.categories():
    for i in movie_reviews.fileids(x):
        if x == 'neg':
            negative_review += [word for word in movie_reviews.words(i)
                                if word not in _stop]
        else:
            positive_review += [word for word in movie_reviews.words(i)
                                if word not in _stop]
        # NOTE(review): break placement was ambiguous in the pasted source;
        # reconstructed as stopping the inner (per-file) loop once the cap
        # is reached — confirm against the original intent.
        if y == 10:
            break
        y += 1

# Keep at most 5000 tokens per category.
negative_review = negative_review[:5000]
positive_review = positive_review[:5000]
# extract data
def extract(word, list_of_words, category):
    """Build a one-word NLTK feature pair.

    Returns ``({word: present}, category)`` where *present* records
    whether *word* occurs in *list_of_words*.
    """
    present = word in list_of_words
    return ({word: present}, category)
# filter data with stem and lemmatizer, then turn words into features
stemmer = ps()
lemmatizer = wnl()
negative_review = [stemmer.stem(w) for w in negative_review]
positive_review = [stemmer.stem(w) for w in positive_review]
negative_review = [lemmatizer.lemmatize(w) for w in negative_review]
positive_review = [lemmatizer.lemmatize(w) for w in positive_review]

# BUG FIX: the original reassigned negative_review to feature tuples first,
# so the positive pass then tested bare-word membership against tuples and
# every positive feature came out {word: True}.  Snapshot both word sets
# BEFORE either list is rebuilt; sets also make membership O(1) instead of
# an O(n) scan of a 5000-element list per word.
neg_words = set(negative_review)
pos_words = set(positive_review)
negative_review = [extract(w, pos_words, 'negative') for w in negative_review]
positive_review = [extract(w, neg_words, 'positive') for w in positive_review]
# --- Split into train/test sets and train a Naive Bayes classifier ---------
# 80/20 split computed from the ACTUAL list lengths.  The original used
# int(.8 * 5000) unconditionally; with only ~11 corpus files scanned the
# lists can hold fewer than 5000 items, which made the test slice empty and
# accuracy() meaningless.
neg_idx = int(.8 * len(negative_review))
pos_idx = int(.8 * len(positive_review))

# Prepping the data
train_data = negative_review[:neg_idx] + positive_review[:pos_idx]
test_data = negative_review[neg_idx:] + positive_review[pos_idx:]
print(len(train_data))
print(len(test_data))

## Train the model
model = nbc.train(train_data)

## Check accuracy
acc = accuracy(model, test_data) * 100
print(acc)
# Read a review from the user and tokenise it for per-word classification.
words_input = input('input words: ')
neg, pos = 0, 0  # per-word vote counters
words = wtok(words_input)
def extract_input(w):
    """Return a single-word feature dict for the trained classifier.

    NOTE(review): by the time this runs, the module-level negative_review
    holds (features, label) tuples, so the membership test can never match
    a bare string and every word maps to True — confirm whether a plain
    word set was intended here.
    """
    return {w: w not in negative_review}
# --- Classify the user's review word by word -------------------------------
# Each word is normalised the same way as the training data, classified
# individually, and the positive-majority vote decides the overall verdict.
for w in words:
    w = stemmer.stem(w)
    w = lemmatizer.lemmatize(w)
    res = model.classify(extract_input(w))
    if res == 'positive':
        pos += 1
    else:
        # FIX: the original declared `neg` but never incremented it.
        neg += 1

# Verdict printed once, after all words are tallied (the paste's lost
# indentation made the original placement ambiguous).
if pos > len(words) / 2:
    print('Review is positive')
else:
    print('Review is negative')
# Show WordNet synonyms — and their antonyms, indented — for the first
# token of the user's review.
w = words[0]
for syn in wordnet.synsets(w):
    for lemma in syn.lemmas():
        print(lemma.name())
        for ant in lemma.antonyms():
            print('\t' + ant.name())
# save data
# `with` guarantees the handle is closed even if pickling raises
# (the original relied on manual close() calls, leaking on error).
with open('data.pickle', 'wb') as save_data:  # wb = write byte
    pickle.dump(model, save_data)

# load data
# NOTE(review): pickle.load executes arbitrary code from the file —
# only ever load pickles you produced yourself.
with open('data.pickle', 'rb') as load_data:  # rb = read byte
    model = pickle.load(load_data)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement