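"""Bag-of-words text classifier built on NLTK's SklearnClassifier wrapper.

ClassifierCSV reads a labelled CSV (document text followed by a label on each
line), builds a stemmed feature vocabulary, trains any scikit-learn estimator
on boolean "contains(word)" features, and can pickle/unpickle the trained
model and vocabulary for reuse.
"""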
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from random import shuffle
import pickle

class ClassifierCSV:
    """Train and apply an NLTK/scikit-learn text classifier from a labelled CSV file."""

    def __init__(self, csv_file, featureset_size=1000, test_ratio=0.1):
        self.csv_file = csv_file
        self.documents = []          # list of (stemmed word list, label) pairs
        self.words = []              # every stemmed word seen, used to build the vocabulary
        self.featureset_size = featureset_size
        self.test_ratio = test_ratio
        self.feature_words = None    # feature vocabulary, built lazily or loaded from disk
        self.classifier = None       # trained SklearnClassifier, or loaded from disk

    def _read_csv(self):
        # Each line is treated as "<text>,<label>": everything before the last
        # comma is the document, the final field is the label.
        port_stem = PorterStemmer()
        with open(self.csv_file, 'r') as input_csv:
            for item in input_csv:
                item = item.split(',')
                doc = re.findall(r'\w+', ''.join(item[:-1]).lower())
                label = item[-1].strip()
                stemmed_doc = []
                for word in doc:
                    stem = port_stem.stem(word)   # words are already lowercased above
                    self.words.append(stem)
                    stemmed_doc.append(stem)
                self.documents.append((stemmed_doc, label))

    def _generate_word_features(self):
        # Keep the N most frequent stems as the feature vocabulary.
        # (list(FreqDist) is not ordered by count, so use most_common instead.)
        frequency_dist = nltk.FreqDist(self.words)
        self.feature_words = [word for word, _ in frequency_dist.most_common(self.featureset_size)]

    def __document_features(self, document):
        # Boolean "contains(word)" features over the feature vocabulary.
        document_words = set(document)
        features = {}
        for word in self.feature_words:
            features['contains({})'.format(word)] = (word in document_words)
        return features

    def train_sklearn_classifier(self, sk_learn_classifier):
        if not self.feature_words:
            self._read_csv()
            self._generate_word_features()
        # Shuffle so the train/test split is random rather than following file order.
        shuffle(self.documents)
        feature_sets = [(self.__document_features(d), c) for (d, c) in self.documents]
        cutoff = int(len(feature_sets) * self.test_ratio)
        train_set, test_set = feature_sets[cutoff:], feature_sets[:cutoff]
        self.classifier = SklearnClassifier(sk_learn_classifier()).train(train_set)
        print('Achieved {0:.2f}% accuracy against training set'.format(nltk.classify.accuracy(self.classifier, train_set) * 100))
        print('Achieved {0:.2f}% accuracy against test set'.format(nltk.classify.accuracy(self.classifier, test_set) * 100))

    def classify_new_sentence(self, sentence):
        # Mirror the training preprocessing: lowercase, tokenise with \w+, then stem.
        port_stem = PorterStemmer()
        stemmed_words = [port_stem.stem(word) for word in re.findall(r'\w+', sentence.lower())]
        if not self.feature_words:
            self._read_csv()
            self._generate_word_features()
        # Reuse the same feature extraction that was used at training time.
        return self.classifier.classify(self.__document_features(stemmed_words))

    def save_model(self, filename):
        # Pickle both the trained classifier and its feature vocabulary.
        with open(filename, "wb") as save_classifier:
            pickle.dump(self.classifier, save_classifier)
        with open('vocab-{}'.format(filename), "wb") as save_vocab:
            pickle.dump(self.feature_words, save_vocab)

    def load_model(self, model_filename, vocab_filename):
        with open(model_filename, "rb") as classifier_f:
            self.classifier = pickle.load(classifier_f)
        with open(vocab_filename, "rb") as vocab_f:
            self.feature_words = pickle.load(vocab_f)

c = ClassifierCSV('../IMDB_Dataset.csv', featureset_size=5000)
b = ClassifierCSV('../IMDB_Dataset.csv', featureset_size=5000)

# To train and persist a model instead of loading one:
# c.train_sklearn_classifier(RandomForestClassifier)
# c.save_model("imdb2.pkl")
c.load_model("imdb2.pkl", "vocab-imdb2.pkl")
b.load_model("imdb.pkl", "vocab-imdb.pkl")

test = "This movie was amazing"
result1 = c.classify_new_sentence(test)
result2 = b.classify_new_sentence(test)   # compare the second loaded model against the first

print(result1, result2)
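
# BernoulliNB is imported above but never used; purely as an illustration (not
# part of the original run), any scikit-learn estimator class can be passed the
# same way, e.g.:
# nb = ClassifierCSV('../IMDB_Dataset.csv', featureset_size=5000)
# nb.train_sklearn_classifier(BernoulliNB)
# nb.save_model("imdb-nb.pkl")   # hypothetical output filename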