Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
# Import the required functions and libraries.
import numpy as np
from pandas import read_csv
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

try:
    # Current home of train_test_split (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import train_test_split
# Read both tweet dumps and stack their rows into a single array.
# The separator is passed by keyword because recent pandas releases no
# longer accept it as a positional argument.
# NOTE(review): read_csv uses the first row of each file as the header by
# default; if these files have no header row, pass header=None so the first
# record of each file is not consumed -- confirm against the data.
df1 = read_csv("./twitter/positive.csv", sep=";")
df2 = read_csv("./twitter/negative.csv", sep=";")
df = np.concatenate((df1, df2), axis=0)
# Take the 4th and 5th columns (indices 3 and 4): tweet text and tweet type.
X = df[:, 3]
Y = df[:, 4]
def classify(maxf, size, cl):
    """Train a naive Bayes classifier on the tweets and print its accuracy.

    The module-level texts ``X`` are converted to token counts, split together
    with the module-level labels ``Y`` into train and test parts, and the
    selected model is fitted on the train part and scored on the test part.
    The result is printed, nothing is returned.

    Args:
        maxf: Passed to ``CountVectorizer`` as ``max_features``.
        size: Passed to ``train_test_split`` as ``test_size``.
        cl: Model to use: ``'multinomial'``, ``'bernoulli'`` or ``'gaussian'``.

    Raises:
        ValueError: If ``cl`` is not one of the supported model names.
    """
    # Choose the model first so an unsupported name fails immediately,
    # before any vectorization work is done.
    if cl == 'multinomial':
        clf = MultinomialNB()
    elif cl == 'bernoulli':
        clf = BernoulliNB()
    elif cl == 'gaussian':
        clf = GaussianNB()
    else:
        raise ValueError(
            "Unknown classifier %r; expected 'multinomial', 'bernoulli' "
            "or 'gaussian'" % (cl,))

    # Convert the texts into a token-count matrix (kept sparse).
    # NOTE(review): the vocabulary is fitted on all of X, i.e. also on rows
    # that end up in the test split; fit on the train part only if a strict
    # hold-out evaluation is required.
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None,
                                 preprocessor=None, stop_words=None,
                                 max_features=maxf)
    features = vectorizer.fit_transform(X)

    # Split into train and test parts; the fixed seed keeps runs comparable.
    x_train, x_test, y_train, y_test = train_test_split(
        features, Y, test_size=size, random_state=0)
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    if cl == 'gaussian':
        # GaussianNB needs dense input; the other two models work on the
        # sparse matrix directly, which avoids a very large dense array.
        x_train = x_train.toarray()
        x_test = x_test.toarray()

    # Fit the model, predict the test part and report the accuracy.
    clf.fit(x_train, y_train)
    res = clf.predict(x_test)
    # A single parenthesised argument prints the same on Python 2 and 3.
    print("Max feature = %s; NB = %s; score = %s;" %
          (maxf, cl, metrics.accuracy_score(y_test, res)))
# Evaluate each model with growing vocabulary caps on a 20% test split:
# all multinomial runs first, then all gaussian runs.
for nb_kind in ('multinomial', 'gaussian'):
    for vocab_cap in (3000, 5000, 7000):
        classify(vocab_cap, 0.2, nb_kind)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement