###############################################
# BIG DATA FINAL PROJECT
###############################################

#**********************************************
# IMPORT LIBRARIES
#**********************************************
import numpy as np
import pandas as pd
import collections
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.model_selection import GridSearchCV

# Libraries for text preprocessing
import re
import nltk
nltk.download('stopwords')  # download once if you have never downloaded it before
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')  # download once if you have never downloaded it before
from nltk.stem.wordnet import WordNetLemmatizer

# Commented out IPython magic to ensure Python compatibility.
# Word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns
#**********************************************
# SET CONSTANTS
#**********************************************
NEGATIVE = 0
NEUTRAL = 1
POSITIVE = 2

#**********************************************
# PREPARE TRAINING AND TESTING DATASET
#**********************************************
path_for_train = 'tweets_GroundTruth.txt'
dataset = pd.read_csv(path_for_train, sep='\t', header=None)
dataset.columns = ['id', 'score', 'tweet']
dataset['word_count'] = dataset['tweet'].apply(lambda x: len(str(x).split(" ")))
dataset[['tweet', 'word_count']].head()
# Identify common words
freq = pd.Series(' '.join(dataset['tweet']).split()).value_counts()[:20]
# Identify uncommon words
freq1 = pd.Series(' '.join(dataset['tweet']).split()).value_counts()[-20:]
freq1

# Creating the stop word list, keeping the negations the classifier needs
stop_words = set(stopwords.words("english"))
to_discard = ['not', 'nor', 'no']
for word in to_discard:
    stop_words.discard(word)
# Adding custom stopwords
new_words = ["anonymous", 'http', 'url']
stop_words = stop_words.union(new_words)
corpus = []
dataset["Abstract"] = ""
# Lemmatiser, hoisted out of the loop (the original also built a PorterStemmer
# inside the loop but never applied it, so only lemmatisation is kept)
lem = WordNetLemmatizer()
for i in range(0, len(dataset)):
    # Remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', dataset['tweet'][i])
    # Convert to lowercase
    text = text.lower()
    # Remove tags
    text = re.sub("</?.*?>", " <> ", text)
    # Remove special characters and digits
    text = re.sub(r"(\d|\W)+", " ", text)
    # Convert string to list of tokens
    text = text.split()
    # Lemmatisation (stop words are dropped at the same time)
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)
    dataset.loc[i, "Abstract"] = corpus[i]
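# The WordCloud imports at the top are otherwise unused; a minimal sketch of
# how they could visualise the cleaned corpus (background_color and max_words
# here are illustrative assumptions, not values from the original script):
wc = WordCloud(stopwords=stop_words, background_color='white', max_words=100)
wc.generate(' '.join(corpus))
plt.figure(figsize=(10, 6))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()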
# Most frequently occurring words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

# Convert most freq words to dataframe for plotting bar plot
top_words = get_top_n_words(corpus, n=20)
top_df = pd.DataFrame(top_words)
top_df.columns = ["Word", "Freq"]
# Barplot of most freq words
sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=30)
#g.figure.savefig("/Users/jeroz/Desktop/NLP/mono-gram_review.png")
# Most frequently occurring bi-grams
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(2, 2), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

top2_words = get_top_n2_words(corpus, n=20)
top2_df = pd.DataFrame(top2_words)
top2_df.columns = ["Bi-gram", "Freq"]
# Barplot of most freq bi-grams
sns.set(rc={'figure.figsize': (15, 17)})
h = sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45)
#h.figure.savefig("/Users/jeroz/Desktop/NLP/bi-gram_review")
# Most frequently occurring tri-grams
def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(3, 3), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

top3_words = get_top_n3_words(corpus, n=20)
top3_df = pd.DataFrame(top3_words)
top3_df.columns = ["Tri-gram", "Freq"]
# Barplot of most freq tri-grams
sns.set(rc={'figure.figsize': (15, 17)})
j = sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45)
#j.figure.savefig("/Users/jeroz/Desktop/NLP/tri-gram_review.png")
df = dataset[['score', 'tweet', 'Abstract']].copy()  # .copy() avoids a SettingWithCopyWarning below
# Set the thresholds that map the continuous score to three classes
negative = -0.55
neutral = 1.2
count_ne = 0
count_po = 0
count_ng = 0
for i in range(0, len(df)):
    score = df.loc[i, 'score']
    if score <= negative:
        df.loc[i, "sentiment"] = NEGATIVE
        count_ng += 1
    elif score < neutral:
        # if score >= 1.1:
        #     print(score, '\t', df.loc[i, "tweet"])
        #     input('*******')
        df.loc[i, "sentiment"] = NEUTRAL
        count_ne += 1
    else:
        df.loc[i, "sentiment"] = POSITIVE
        count_po += 1
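# Quick sanity check on the class balance produced by the thresholds; the
# counters above track the same numbers (this mirrors the commented count
# prints in the NBC test section below) and is purely diagnostic:
print('NEGATIVE:', count_ng, ' NEUTRAL:', count_ne, ' POSITIVE:', count_po)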
# Split the data
X_train, X_test, y_train, y_test = train_test_split(df.Abstract, df.sentiment, test_size=0.4, random_state=17)
# Word count vectorizer
cv = CountVectorizer(ngram_range=(1, 1))
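# TfidfVectorizer is imported at the top but never used; a minimal sketch of
# the TF-IDF alternative as a drop-in for cv (the parameters are assumptions,
# not values from the original script):
# tfidf = TfidfVectorizer(ngram_range=(1, 1))
# X_train_tf = tfidf.fit_transform(X_train)
# X_test_tf = tfidf.transform(X_test)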
#**********************************************
# PREPARE VALIDATION DATASET
#**********************************************
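# No validation file ships with this paste, yet the commented [NBC]/[SVM]
# validation blocks below expect a val_data frame with an Abstract column.
# A minimal sketch, assuming a hypothetical file 'tweets_validation.txt'
# with the same tab-separated layout as the training file:
# val_data = pd.read_csv('tweets_validation.txt', sep='\t', header=None)
# val_data.columns = ['id', 'score', 'tweet']
# val_data would then go through the same cleaning loop as dataset above
# to fill val_data['Abstract'] before calling cv.transform(val_data.Abstract).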
#**********************************************
# [NBC] TRAIN THE MODEL
#**********************************************
X_train_tf = cv.fit_transform(X_train)
alpha = 1
class_prior = [0.25, 0.36, 0.38]  # hand-set priors for NEGATIVE/NEUTRAL/POSITIVE; sklearn uses them as given
fit_prior = False
# df.sentiment.value_counts()
mnb = MultinomialNB(alpha=alpha, class_prior=class_prior, fit_prior=fit_prior)
# mnb = MultinomialNB()
mnb.fit(X_train_tf, y_train)
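# cross_val_score and KFold are imported at the top but never used; a minimal
# sketch of 5-fold cross-validation for the NBC model. scoring='f1_macro' is
# an assumption here (plain 'f1' is undefined for a three-class problem):
# nbc_scores = cross_val_score(MultinomialNB(alpha=alpha), X_train_tf, y_train,
#                              cv=5, scoring='f1_macro')
# print('NBC 5-fold f1_macro: %.3f +/- %.3f' % (nbc_scores.mean(), nbc_scores.std()))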
#**********************************************
# [NBC] TEST THE MODEL
#**********************************************
X_test_tf = cv.transform(X_test)
y_pred = mnb.predict(X_test_tf)
# print('\n\n\nTEST CONFUSION MATRIX', alpha)
# print(confusion_matrix(y_true=y_test, y_pred=y_pred), '\n\n')
# print(y_test)
# print('POSITIVE', count_po)
# print('NEUTRAL', count_ne)
# print('NEGATIVE', count_ng)
# **********************************************
# [NBC] USE MODEL TO VALIDATE THE DATASET
# **********************************************
# note: transform (not fit_transform) keeps the vocabulary the model was trained on
# val_data_tf = cv.transform(val_data.Abstract)
# val_data_pred = mnb.predict(val_data_tf)
# print(val_data.info())
#**********************************************
# [SVM] TRAIN THE MODEL
#**********************************************
clf = svm.SVC(kernel='linear', C=1)  # only used by the commented cross-validation below
# scores = cross_val_score(clf, X_train_tf, y_train, cv=5, scoring='f1_macro')  # f1_macro: plain 'f1' fails with three classes
classifier_linear = svm.SVC(kernel='linear', C=0.27)
classifier_linear.fit(X_train_tf, y_train)
# Defining the parameter range for a grid search
# param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 0.27, 0.55, 1, 2, 5],
#               'kernel': ['linear'],
#               'class_weight': [{0: w} for w in [1, 2, 4, 6, 8, 10]]}
# grid = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=3, scoring="f1_macro")
# Fitting the model for grid search
# grid.fit(X_train_tf, y_train)
#**********************************************
# [SVM] TEST THE MODEL
#**********************************************
y_pred = classifier_linear.predict(X_test_tf)
# print('\n\n\nTEST CONFUSION MATRIX', 'C =', 0.27)
print(confusion_matrix(y_true=y_test, y_pred=y_pred), '\n\n')
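# A fuller per-class summary than the raw confusion matrix; classification_report
# is not imported at the top of this script, so it is imported here. This
# assumes all three classes appear in the test split:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['NEGATIVE', 'NEUTRAL', 'POSITIVE']))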
# **********************************************
# [SVM] USE MODEL TO VALIDATE THE DATASET
# **********************************************
# note: transform (not fit_transform) keeps the vocabulary the model was trained on
# val_data_tf = cv.transform(val_data.Abstract)
# val_data_pred = classifier_linear.predict(val_data_tf)
# print(val_data.info())

#**********************************************
# USE MODEL TO EVALUATE
#**********************************************
eval_data = pd.read_csv("training.1600000.processed.noemoticon.csv", sep=',', header=None, encoding="ISO-8859-1")
eval_data.columns = ['score', 'id', 'date', 'flag', 'user', 'tweet']  # Sentiment140 layout: polarity, id, date, flag, user, text
eval_data = eval_data[['tweet']]
# print(eval_data.head())
# eval_data['score'] = eval_data['score'] / 2
# X_train_tf = cv.fit_transform(X_train)
# Stop words: reuse the stop_words set built above (same base list minus the
# negations, plus the custom words); the original rebuilt it here verbatim.
# print(stop_words)  # be careful: contractions such as wasn't/isn't are not in this list
corpus = []
eval_data["Abstract"] = ""
for i in range(0, len(eval_data)):
    # Remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', eval_data['tweet'][i])
    # Convert to lowercase
    text = text.lower()
    # Remove tags
    text = re.sub("</?.*?>", " <> ", text)
    # Remove special characters and digits
    text = re.sub(r"(\d|\W)+", " ", text)
    # Convert string to list of tokens
    text = text.split()
    # Lemmatisation with the lemmatiser built above (stop words dropped at the same time)
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)
    eval_data.loc[i, "Abstract"] = corpus[i]
eval_data_tf = cv.transform(eval_data.Abstract)
y_pred_svm = classifier_linear.predict(eval_data_tf)
y_pred_nbc = mnb.predict(eval_data_tf)

# Write the tab-separated results (built-in open replaces the original
# io.open; they are equivalent in Python 3). Note that the raw tweet text is
# written as-is, so tweets containing tabs or newlines would break the layout.
with open('evaluation.csv', "w+", encoding="utf-8") as f:
    f.write('tweet\tnbc_label\tsvm_label\n')
    for tweet, label, label2 in zip(eval_data.tweet, y_pred_nbc, y_pred_svm):
        if label == POSITIVE:
            tag = 'POSITIVE'
        elif label == NEUTRAL:
            tag = 'NEUTRAL'
        else:
            tag = 'NEGATIVE'
        if label2 == POSITIVE:
            tag2 = 'POSITIVE'
        elif label2 == NEUTRAL:
            tag2 = 'NEUTRAL'
        else:
            tag2 = 'NEGATIVE'
        f.write(tweet)
        f.write('\t')
        f.write(tag)
        f.write('\t')
        f.write(tag2)
        f.write('\n')
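# Optional sanity check (a sketch): read the tab-separated output back and
# inspect the first rows; this assumes no tweet contained a tab or newline.
# check = pd.read_csv('evaluation.csv', sep='\t')
# print(check.head())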