Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Bag-of-words + Gaussian Naive Bayes classifier over the 'Opening' text
# column of OpeningLines2.txt, evaluated with a confusion matrix on a 20%
# hold-out split.

# Importing the libraries
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Importing the dataset (tab-separated; this script reads its 'Opening' and
# 'Review' columns).
dataset = pd.read_csv('OpeningLines2.txt', delimiter='\t')
dataset.head()

# EDA
# NOTE: the bare expressions below only display output in an interactive
# session / notebook; they are no-ops when run as a plain script.
dataset.describe()
dataset.groupby('Review').describe()
dataset['length'] = dataset['Opening'].apply(len)
dataset.head()

# Cleaning the texts
nltk.download('stopwords')

# Loop invariants: build the stop-word set, the stemmer and the regex once
# instead of once per word / per row.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over every row rather than a hard-coded count, so X and y always
# have the same number of samples regardless of the dataset size.
for raw_opening in dataset['Opening']:
    # Keep letters only, lowercase, tokenize on whitespace.
    words = non_letters.sub(' ', raw_opening).lower().split()
    # Drop stop words, stem the remainder, and rebuild a cleaned string.
    stemmed = [ps.stem(word) for word in words if word not in stop_words]
    corpus.append(' '.join(stemmed))

# Creating the Bag of Words model
cv = CountVectorizer(max_features=1000)
X = cv.fit_transform(corpus).toarray()
# Select the target by name: a positional lookup (iloc[:, 2]) is shifted by
# the 'length' column appended during EDA above.
# NOTE(review): assumes 'Review' is the label column (it is the one used for
# the groupby above) -- confirm against the data file.
y = dataset['Review'].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Fitting Naive Bayes to the Training set
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement