###############################################
#     BIG DATA FINAL PROJECT
###############################################

#**********************************************
#     IMPORT LIBRARIES
#**********************************************
import numpy as np
import pandas as pd
import collections
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.model_selection import GridSearchCV


# Libraries for text preprocessing
import re
import nltk
nltk.download('stopwords')  # only needed the first time this corpus is used
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')  # only needed the first time this corpus is used
from nltk.stem.wordnet import WordNetLemmatizer

# Word cloud
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

import seaborn as sns

#**********************************************
#     SET CONSTANTS
#**********************************************

# Sentiment class labels
NEGATIVE = 0
NEUTRAL = 1
POSITIVE = 2

#**********************************************
#     PREPARE TRAINING AND TESTING DATASET
#**********************************************

path_for_train = 'tweets_GroundTruth.txt'
dataset = pd.read_csv(path_for_train, sep='\t', header=None)
dataset.columns = ['id', 'score', 'tweet']

dataset['word_count'] = dataset['tweet'].apply(lambda x: len(str(x).split(" ")))
dataset[['tweet', 'word_count']].head()

# Identify the 20 most common words
freq = pd.Series(' '.join(dataset['tweet']).split()).value_counts()[:20]

# Identify the 20 least common words
freq1 = pd.Series(' '.join(dataset['tweet']).split()).value_counts()[-20:]
freq1

# Build the stop word list, keeping negation words ('not', 'nor', 'no')
stop_words = set(stopwords.words("english"))
to_discard = ['not', 'nor', 'no']
for word in to_discard:
    stop_words.discard(word)

# Add custom stopwords
new_words = ["anonymous", 'http', 'url']
stop_words = stop_words.union(new_words)

corpus = []
dataset["Abstract"] = ""

for i in range(0, len(dataset)):
    # Remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', dataset['tweet'][i])

    # Convert to lowercase
    text = text.lower()

    # Remove tags
    text = re.sub("</?.*?>", " <> ", text)

    # Remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)

    # Convert string to a list of tokens
    text = text.split()

    # Stemming (the stemmer is instantiated but never applied;
    # only lemmatisation is used below)
    ps = PorterStemmer()

    # Lemmatisation, dropping stop words
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)
    dataset.loc[i, "Abstract"] = corpus[i]

# Most frequently occurring words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

# Convert the most frequent words to a dataframe for plotting
top_words = get_top_n_words(corpus, n=20)
top_df = pd.DataFrame(top_words)
top_df.columns = ["Word", "Freq"]

# Barplot of the most frequent words
sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=30)
# g.figure.savefig("/Users/jeroz/Desktop/NLP/mono-gram_review.png")

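# The wordcloud/PIL imports above are otherwise unused, so here is a minimal,
# hedged sketch (not part of the original script) that renders a word cloud of
# the cleaned corpus; the figure size and output file name are assumptions.
wc = WordCloud(width=800, height=400, background_color='white',
               stopwords=STOPWORDS).generate(" ".join(corpus))
plt.figure(figsize=(13, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
# plt.savefig("wordcloud_review.png")  # hypothetical file name
plt.show()
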
# Most frequently occurring bi-grams
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(2, 2), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

top2_words = get_top_n2_words(corpus, n=20)
top2_df = pd.DataFrame(top2_words)
top2_df.columns = ["Bi-gram", "Freq"]

# Barplot of the most frequent bi-grams
sns.set(rc={'figure.figsize': (15, 17)})
h = sns.barplot(x="Bi-gram", y="Freq", data=top2_df)
h.set_xticklabels(h.get_xticklabels(), rotation=45)
# h.figure.savefig("/Users/jeroz/Desktop/NLP/bi-gram_review")

# Most frequently occurring tri-grams
def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(3, 3), max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

top3_words = get_top_n3_words(corpus, n=20)
top3_df = pd.DataFrame(top3_words)
top3_df.columns = ["Tri-gram", "Freq"]

# Barplot of the most frequent tri-grams
sns.set(rc={'figure.figsize': (15, 17)})
j = sns.barplot(x="Tri-gram", y="Freq", data=top3_df)
j.set_xticklabels(j.get_xticklabels(), rotation=45)
# j.figure.savefig("/Users/jeroz/Desktop/NLP/tri-gram_review.png")

# Work on a copy so the label assignments below do not trigger
# pandas SettingWithCopy warnings
df = dataset[['score', 'tweet', 'Abstract']].copy()

# Sentiment-score thresholds
negative = -0.55
neutral = 1.2

count_ne = 0
count_po = 0
count_ng = 0

# Map the continuous score to the three sentiment classes
for i in range(0, len(df)):
    score = df.loc[i, 'score']
    if score <= negative:
        df.loc[i, "sentiment"] = NEGATIVE
        count_ng += 1

    elif score < neutral:
        # if score >= 1.1:
        #     print(score, '\t', df.loc[i, "tweet"])
        #     input('*******')
        df.loc[i, "sentiment"] = NEUTRAL
        count_ne += 1

    else:
        df.loc[i, "sentiment"] = POSITIVE
        count_po += 1


# Split the data
X_train, X_test, y_train, y_test = train_test_split(df.Abstract, df.sentiment,
                                                    test_size=0.4, random_state=17)

# Word count vectorizer (unigrams only)
cv = CountVectorizer(ngram_range=(1, 1))

#**********************************************
#     PREPARE VALIDATION DATASET
#**********************************************

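# The commented-out "[NBC]/[SVM] USE MODEL TO VALIDATE" blocks further down
# refer to a val_data frame that this paste never builds. A minimal, hedged
# sketch is left here, commented out to match those blocks; the file name and
# layout are assumptions (same layout as the training file, same cleaning loop
# applied before vectorising).
# val_data = pd.read_csv('tweets_validation.txt', sep='\t', header=None)  # hypothetical file
# val_data.columns = ['id', 'score', 'tweet']
# val_data["Abstract"] = ""  # fill via the same cleaning loop used for `dataset`
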
#**********************************************
#     [NBC] TRAIN THE MODEL
#**********************************************
X_train_tf = cv.fit_transform(X_train)

# Smoothing and fixed class priors (negative, neutral, positive)
alpha = 1
class_prior = [0.25, 0.36, 0.38]
fit_prior = False

# df.sentiment.value_counts()
mnb = MultinomialNB(alpha=alpha, class_prior=class_prior, fit_prior=fit_prior)
# mnb = MultinomialNB()
mnb.fit(X_train_tf, y_train)


#**********************************************
#     [NBC] TEST THE MODEL
#**********************************************

X_test_tf = cv.transform(X_test)
y_pred = mnb.predict(X_test_tf)
# print('\n\n\nTEST CONFUSION MATRIX', alpha)
# print(confusion_matrix(y_true=y_test, y_pred=y_pred), '\n\n')

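# Hedged addition (not in the original paste): a per-class summary of the
# Naive Bayes test performance, since the confusion-matrix print above is
# commented out. Class 0 = negative, 1 = neutral, 2 = positive.
from sklearn.metrics import classification_report
print('NBC test report:\n', classification_report(y_test, y_pred))
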
# #print(y_test)

# print('POSITIVE', count_po)
# print('NEUTRAL', count_ne)
# print('NEGATIVE', count_ng)

# # **********************************************
# #    [NBC] USE MODEL TO VALIDATE THE DATASET
# # **********************************************

# # Note: use transform (not fit_transform) so the vocabulary fitted on the
# # training set is kept.
# val_data_tf = cv.transform(val_data.Abstract)

# val_data_pred = mnb.predict(val_data_tf)
# print(val_data.info())

#**********************************************
#     [SVM] TRAIN THE MODEL
#**********************************************

# clf is only used by the (commented-out) cross-validation line below
clf = svm.SVC(kernel='linear', C=1)

# scores = cross_val_score(clf, X_train_tf, y_train, cv=5, scoring='f1')
classifier_linear = svm.SVC(kernel='linear', C=0.27)
classifier_linear.fit(X_train_tf, y_train)

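# Hedged addition (not in the original paste): a live version of the
# cross-validation line above. Plain 'f1' raises an error for this 3-class
# problem, so the macro-averaged variant is used here; cv=5 matches the
# commented-out call.
cv_scores = cross_val_score(clf, X_train_tf, y_train, cv=5, scoring='f1_macro')
print('SVM 5-fold macro-F1: %.3f (+/- %.3f)' % (cv_scores.mean(), cv_scores.std()))
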
# # defining the parameter range for a grid search
# param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 0.27, 0.55, 1, 2, 5],
#               'kernel': ['linear'],
#               'class_weight': [{0: w} for w in [1, 2, 4, 6, 8, 10]]}

# # scoring should be 'f1_macro' for this 3-class problem
# grid = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=3, scoring="f1_macro")

# # fitting the model for grid search
# grid.fit(X_train_tf, y_train)

#**********************************************
#     [SVM] TEST THE MODEL
#**********************************************
y_pred = classifier_linear.predict(X_test_tf)
# print('\n\n\nTEST CONFUSION MATRIX', 'C = ', classifier_linear.C)
print(confusion_matrix(y_true=y_test, y_pred=y_pred), '\n\n')

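# Hedged addition (not in the original paste): per-class precision/recall/F1
# for the linear SVM on the same held-out split, to complement the raw
# confusion matrix printed above.
from sklearn.metrics import classification_report
print('SVM test report:\n', classification_report(y_test, y_pred))
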

# # **********************************************
# #    [SVM] USE MODEL TO VALIDATE THE DATASET
# # **********************************************

# # As above, transform (not fit_transform) keeps the training vocabulary.
# val_data_tf = cv.transform(val_data.Abstract)

# val_data_pred = classifier_linear.predict(val_data_tf)
# print(val_data.info())

# # **********************************************
# #    USE MODEL TO EVALUATE
# # **********************************************
# Sentiment140 column layout: polarity, tweet id, date, query flag, user, text
eval_data = pd.read_csv("training.1600000.processed.noemoticon.csv", sep=',',
                        header=None, encoding="ISO-8859-1")
eval_data.columns = ['score', 'id', 'date', 'query', 'user', 'tweet']
eval_data = eval_data[['tweet']]
# print(eval_data.head())

# eval_data['score'] = eval_data['score']/2

# X_train_tf = cv.fit_transform(X_train)

# Rebuild the stop word list, again keeping negation words
stop_words = set(stopwords.words("english"))
to_discard = ['not', 'nor', 'no']
for word in to_discard:
    stop_words.discard(word)


# Add custom stopwords
new_words = ["anonymous", 'http', 'url']
stop_words = stop_words.union(new_words)

# print(stop_words)  # BE CAREFUL with contractions such as wasn't/isn't

corpus = []
eval_data["Abstract"] = ""

# Apply the same cleaning steps used for the training data
for i in range(0, len(eval_data)):
    # Remove punctuation
    text = re.sub('[^a-zA-Z]', ' ', eval_data['tweet'][i])

    # Convert to lowercase
    text = text.lower()

    # Remove tags
    text = re.sub("</?.*?>", " <> ", text)

    # Remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)

    # Convert string to a list of tokens
    text = text.split()

    # Stemming (instantiated but never applied, as above)
    ps = PorterStemmer()
    # Lemmatisation, dropping stop words
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in text if word not in stop_words]
    text = " ".join(text)
    corpus.append(text)
    eval_data.loc[i, "Abstract"] = corpus[i]

# Vectorise the cleaned evaluation tweets and predict with both models
eval_data_tf = cv.transform(eval_data.Abstract)
y_pred_svm = classifier_linear.predict(eval_data_tf)
y_pred_nbc = mnb.predict(eval_data_tf)

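# Hedged sanity check (not in the original paste): how often the two
# classifiers agree on the unlabeled evaluation tweets, and the label counts
# each one produces (0 = negative, 1 = neutral, 2 = positive).
print('NBC/SVM agreement rate:', np.mean(y_pred_nbc == y_pred_svm))
print('NBC label counts:', collections.Counter(y_pred_nbc))
print('SVM label counts:', collections.Counter(y_pred_svm))
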
import io

# Write each tweet with both predicted labels (tab-separated) to disk
with io.open('evaluation.csv', "w+", encoding="utf-8") as f:
    f.write('tweet\tnbc_label\tsvm_label\n')
    for tweet, label, label2 in zip(eval_data.tweet, y_pred_nbc, y_pred_svm):
        if label == POSITIVE:
            tag = 'POSITIVE'
        elif label == NEUTRAL:
            tag = 'NEUTRAL'
        else:
            tag = 'NEGATIVE'
        if label2 == POSITIVE:
            tag2 = 'POSITIVE'
        elif label2 == NEUTRAL:
            tag2 = 'NEUTRAL'
        else:
            tag2 = 'NEGATIVE'
        f.write(tweet)
        f.write('\t')
        f.write(tag)
        f.write('\t')
        f.write(tag2)
        f.write('\n')