Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Train a bag-of-words Naive Bayes sentiment classifier on a CSV of rated items.
#
# Pipeline: load CSV -> derive a +1/-1 sentiment label from the rating ->
# split into train/test -> vectorise the text column -> fit MultinomialNB.
import csv
import string
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same helper
# lives in `sklearn.model_selection` (available since 0.18).
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): assumed name of the free-text column. Only the "name" and
# "rating" columns are visible in this script -- confirm against test.csv.
TEXT_COLUMN = "review"

# Importing dataset
# NOTE: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# on those versions pass `on_bad_lines="skip"` instead.
data = pd.read_csv(
    "test.csv",
    quotechar='"',
    delimiter=',',
    quoting=csv.QUOTE_ALL,
    skipinitialspace=True,
    error_bad_lines=False,
)

# Index by product name but keep "name" available as a regular column too.
df2 = data.set_index("name", drop=False)

# Binary label: ratings above 3 are positive (+1), everything else negative (-1).
df2['sentiment'] = df2['rating'].apply(lambda rating: +1 if rating > 3 else -1)

train, test = train_test_split(df2, test_size=0.2)

# The vectoriser must be given the text documents themselves. Passing the whole
# DataFrame iterates over its column labels, so the vocabulary would be built
# from the labels instead of the reviews. Missing texts become empty strings
# because CountVectorizer cannot tokenise NaN.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train[TEXT_COLUMN].fillna("").astype(str))
# Re-use the vocabulary learned on the training split; never re-fit on test data.
test_matrix = count_vect.transform(test[TEXT_COLUMN].fillna("").astype(str))

# Fit on the training counts and the matching training labels. The original
# referenced `X_train_tfidf` / `twenty_train`, which are never defined here.
clf = MultinomialNB().fit(X_train_counts, train['sentiment'])
# Parameter reference for `fit(X, y)` (excerpt from the scikit-learn docs):
#   X : {array-like, sparse matrix}, shape = [n_samples, n_features]
#       Training vectors, where n_samples is the number of samples and
#       n_features is the number of features.
#   y : array-like, shape = [n_samples]
#       Target values.
# Classify a four-category subset of the 20 Newsgroups corpus with TF-IDF
# features and a Multinomial Naive Bayes model.
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Load only the selected categories. The original first loaded the full
# training set and immediately overwrote it, which was wasted work.
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)

vectorizer = TfidfVectorizer()
# the following will be the training data
vectors = vectorizer.fit_transform(newsgroups_train.data)

newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
# this is the test data; it is transformed with the vocabulary and IDF weights
# learned from the training data (transform, not fit_transform)
vectors_test = vectorizer.transform(newsgroups_test.data)

clf = MultinomialNB(alpha=.01)

# Check the shapes before fitting: the number of rows in the feature matrix
# must equal the number of targets. A bare `vectors.shape` expression only
# displays a value in an interactive session, so print it explicitly.
print("training matrix shape: {}".format(vectors.shape))
# (2034, 34118)
print("training target shape: {}".format(newsgroups_train.target.shape))
# (2034,)

# fit the model using the TRAINING data
clf.fit(vectors, newsgroups_train.target)

# the PREDICTION is done using the TEST data
pred = clf.predict(vectors_test)
Add Comment
Please, Sign In to add comment