Advertisement
Guest User

Untitled

a guest
Dec 9th, 2016
61
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.94 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. # импорт нужных функций и библиотек
  3. import numpy as np
  4. from pandas import read_csv
  5. from sklearn.feature_extraction.text import CountVectorizer
  6. from sklearn.cross_validation import train_test_split
  7. from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
  8. from sklearn import metrics
  9.  
  10. # читаем данные и соединяем в один массив
  11. df1 = read_csv("./twitter/positive.csv", ";")
  12. df2 = read_csv("./twitter/negative.csv", ";")
  13. df = np.concatenate((df1, df2), axis=0)
  14. # берём 4 и 5 столбцы - текст сообщения и тип твита
  15. X = df[:, 3]
  16. Y = df[:, 4]
  17.  
  18.  
  19. def classify(maxf, size, cl):
  20.  
  21. # преобразование текста в массив
  22. vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=maxf)
  23. train_data_features = vectorizer.fit_transform(X)
  24. train_data_features = train_data_features.toarray()
  25.  
  26. # разделение на тестовую и обучающую выборки
  27. x_train, x_test, y_train, y_test = train_test_split(train_data_features, Y, test_size=size, random_state=0)
  28. y_train = y_train.astype(int)
  29. y_test = y_test.astype(int)
  30.  
  31. # выбор классификатора
  32. if cl == 'multinomial':
  33. clf = MultinomialNB()
  34. if cl == 'bernoulli':
  35. clf = BernoulliNB()
  36. if cl == 'gaussian':
  37. clf = GaussianNB()
  38.  
  39. # построение модели, прогнозирование значений, вывод оценки качества
  40. clf.fit(x_train, y_train)
  41. res = clf.predict(x_test)
  42. print "Max feature = %s; NB = %s; score = %s;" % \
  43. (maxf, cl, metrics.accuracy_score(y_test, res))
  44.  
  45.  
  46. classify(3000, 0.2, 'multinomial')
  47. classify(5000, 0.2, 'multinomial')
  48. classify(7000, 0.2, 'multinomial')
  49. classify(3000, 0.2, 'gaussian')
  50. classify(5000, 0.2, 'gaussian')
  51. classify(7000, 0.2, 'gaussian')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement