import numpy as np
import pandas as pd
import sys
import json
import re
import pickle
import time
import datetime

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Dense, Flatten, Conv1D, MaxPooling1D, Dropout, LSTM, TimeDistributed, Activation, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers, regularizers

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils import class_weight as cw

from gensim.models import word2vec

import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

EPOCHS = 10
BATCH_SIZE = 512
EMBED_DIMS = 200

MAX_LEN = 186

MODEL_FILE = 'model.hdf5'
CHECKPOINT_FILE = 'checkpoint.hdf5'
VECTORIZER_FILE = 'vect.pkl'

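# Expected inputs/outputs (paths referenced below): the tweets dataset at data/Tweets.csv,
# pre-trained GloVe vectors at data/glove.twitter.27B.200d.txt, an abbreviation.json mapping,
# and a saved_models/ directory where the model, checkpoint and tokenizer are written.
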
class AirlineSentiment:
    def __init__(self, text_preprocessor):
        self.df = self.get_data()
        # character length of each raw tweet (used by metrics() for the length histogram)
        self.df['max_len'] = self.df['text'].apply(lambda x: len(x))

        sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
        self.df['airline_sentiment'] = self.df['airline_sentiment'].map(sentiment_map)
        self.clean_text(text_preprocessor)
        # keep only rows whose cleaned text is pure ASCII and not null
        self.df = self.df[~self.df['clean_text'].apply(self.is_not_ascii)]
        self.df = self.df[pd.notnull(self.df['clean_text'])]

    def train(self):
        embed_dict = self.create_word_embeddings_dict()
        vocab_size = 100000  # capped; full GloVe vocab is 1193514 words (len(embed_dict.keys()))
        print('VOCAB SIZE:', vocab_size)
        max_len = MAX_LEN

        X = self.text_to_word_embeddings(self.df['clean_text'].values, vocab_size)
        y = np_utils.to_categorical(self.df['airline_sentiment'].values)
        X_train, X_val, Y_train, Y_val = train_test_split(X, y, test_size=0.3, random_state=42)

        # embedding matrix: row i holds the GloVe vector of the word with tokenizer index i
        self.embed_matrix = np.zeros((vocab_size, EMBED_DIMS))
        for w, i in self.tokenizer.word_index.items():
            if i < vocab_size:
                vect = embed_dict.get(w)
                if vect is not None:
                    self.embed_matrix[i] = vect
            else:
                break

        print('embedding matrix shape:', self.embed_matrix.shape)

        model = self.build_model()

        filepath = "saved_models/{}".format(CHECKPOINT_FILE)
        checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

        # class weights are computed from the integer labels, not the one-hot matrix
        class_weight = self.get_weight(np.argmax(Y_train, axis=1))

        history = model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
                            validation_data=(X_val, Y_val), callbacks=[checkpoint], class_weight=class_weight)

        model.save("saved_models/{}".format(MODEL_FILE))

        score, acc = model.evaluate(X_val, Y_val, verbose=2, batch_size=BATCH_SIZE)
        print("score: %.2f" % (score))
        print("acc: %.2f" % (acc))

        self.plot_performance(history, 'saved_models')

        # model2 = self.get_rnn_model()
        # history2 = model2.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(X_val, Y_val), callbacks=[checkpoint], class_weight=class_weight)
        # self.compare_models(history, history2)

    def create_word_embeddings_dict(self):
        filename = "data/{}".format('glove.twitter.27B.200d.txt')
        emb_dict = {}
        with open(filename, 'r', encoding="utf-8") as glove:
            for line in glove:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                emb_dict[word] = vector
        return emb_dict

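    # Note: each line of the GloVe text file is expected to be a token followed by its
    # 200 embedding values, whitespace-separated, which is what the parser above assumes.
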
    def build_model(self):
        model = Sequential()
        # 'input_dim' = the vocabulary size we choose, i.e. the number of unique words kept.
        # 'output_dim' = the number of dimensions to embed into; each word becomes a vector of this many dimensions.
        # Example of the resulting embedding shape: (3, 12, 8) means
        #   3  ---> number of documents,
        #   12 ---> each document is padded/truncated to 12 words (the maximum document length),
        #   8  ---> each word is represented by an 8-dimensional vector.
        model.add(Embedding(input_dim=self.embed_matrix.shape[0], output_dim=self.embed_matrix.shape[1],
                            input_length=MAX_LEN, weights=[self.embed_matrix], trainable=False))

        # single-LSTM alternative:
        # model.add(LSTM(EMBED_DIMS, dropout=0.2, recurrent_dropout=0.2))
        # model.add(Dense(3, activation='softmax'))

        model.add(LSTM(EMBED_DIMS, return_sequences=True))
        model.add(LSTM(EMBED_DIMS, return_sequences=False))
        # if use_dropout:
        # model.add(Dropout(0.5))
        model.add(Dense(3, activation='softmax'))

        # Adam default parameters: lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0
        adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        model.summary()
        return model

    def get_rnn_model(self):
        model = Sequential()

        model.add(Embedding(input_dim=self.embed_matrix.shape[0], output_dim=self.embed_matrix.shape[1],
                            input_length=MAX_LEN, weights=[self.embed_matrix], trainable=False))
        model.add(LSTM(EMBED_DIMS))

        model.add(Dropout(0.5))
        model.add(BatchNormalization())
        model.add(Dropout(0.5))

        model.add(Dense(512, activation='relu'))

        model.add(Dropout(0.5))
        model.add(BatchNormalization())
        model.add(Dropout(0.5))

        model.add(Dense(3, activation='softmax'))
        adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

        model.summary()

        return model

    def build_regularization_model(self):
        model = Sequential()
        model.add(Embedding(input_dim=self.embed_matrix.shape[0], output_dim=self.embed_matrix.shape[1],
                            input_length=MAX_LEN, weights=[self.embed_matrix], trainable=False))

        # model.add(LSTM(EMBED_DIMS, dropout=0.2, recurrent_dropout=0.2))
        # model.add(Dense(3, activation='softmax'))

        model.add(LSTM(EMBED_DIMS))
        # L2-regularized dense layers on top of the LSTM output
        model.add(Dense(512, kernel_regularizer=regularizers.l2(0.001), activation='relu'))
        model.add(Dense(512, kernel_regularizer=regularizers.l2(0.001), activation='relu'))

        model.add(Dense(3, activation='softmax'))

        # Adam default parameters: lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0
        adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        model.summary()
        return model

    def compare_models(self, h1, h2):
        loss_base_model = h1.history['val_loss']
        loss_model = h2.history['val_loss']

        e = range(1, EPOCHS + 1)

        plt.plot(e, loss_base_model, 'bo', label='Validation Loss Model1')
        plt.plot(e, loss_model, 'b', label='Validation Loss Model2')
        plt.legend()
        # save before show(), otherwise the saved figure is blank
        plt.savefig("saved_models/compare_models.png")
        plt.show()

    def get_weight(self, y):
        # y holds integer class labels; return a {class_index: weight} dict as expected by model.fit
        weights = cw.compute_class_weight('balanced', classes=np.unique(y), y=y)
        return dict(zip(np.unique(y), weights))

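    # The returned mapping looks like {0: w_negative, 1: w_neutral, 2: w_positive},
    # e.g. (hypothetical numbers for an imbalanced split) {0: 0.53, 1: 1.57, 2: 2.06}.
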
    def predict_single_text(self, text):
        model = load_model("saved_models/{}".format(MODEL_FILE))
        with open("saved_models/{}".format(VECTORIZER_FILE), 'rb') as f2:
            vect = pickle.load(f2)
        sequences = vect.texts_to_sequences([text])
        X_test = pad_sequences(sequences, maxlen=MAX_LEN)
        print('predict...')
        pred = model.predict(X_test)[0]
        prob_map = ['NEGATIVE', 'NEUTRAL', 'POSITIVE']
        print('****************')
        print(prob_map[np.argmax(pred)])
        print('****************')

    def text_to_word_embeddings(self, texts, vocab_size):
        self.tokenizer = Tokenizer(num_words=vocab_size)
        self.tokenizer.fit_on_texts(texts)

        sequences = self.tokenizer.texts_to_sequences(texts)

        x_train = pad_sequences(sequences, maxlen=MAX_LEN)

        with open("saved_models/{}".format(VECTORIZER_FILE), 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('tokenizer saved')

        return x_train

    def is_not_ascii(self, string):
        return string is not None and any([ord(s) >= 128 for s in string])

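    # e.g. is_not_ascii('café') -> True (such rows are dropped in __init__), is_not_ascii('cafe') -> False
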
    def plot_performance(self, history=None, figure_directory=None, ylim_pad=[0, 0]):
        xlabel = 'Epoch'
        legends = ['Training', 'Validation']

        plt.figure(figsize=(20, 5))

        y1 = history.history['acc']
        y2 = history.history['val_acc']

        min_y = min(min(y1), min(y2)) - ylim_pad[0]
        max_y = max(max(y1), max(y2)) + ylim_pad[0]

        plt.subplot(121)

        plt.plot(y1)
        plt.plot(y2)
        date_time = 'Timestamp: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
        plt.title('Model Accuracy\n' + date_time, fontsize=17)
        plt.xlabel(xlabel, fontsize=15)
        plt.ylabel('Accuracy', fontsize=15)
        plt.ylim(min_y, max_y)
        plt.legend(legends, loc='upper left')
        plt.grid()

        y1 = history.history['loss']
        y2 = history.history['val_loss']

        min_y = min(min(y1), min(y2)) - ylim_pad[1]
        max_y = max(max(y1), max(y2)) + ylim_pad[1]

        plt.subplot(122)

        plt.plot(y1)
        plt.plot(y2)

        plt.title('Model Loss\n' + date_time, fontsize=17)
        plt.xlabel(xlabel, fontsize=15)
        plt.ylabel('Loss', fontsize=15)
        plt.ylim(min_y, max_y)
        plt.legend(legends, loc='upper left')
        plt.grid()
        if figure_directory:
            plt.savefig(figure_directory + "/history")

        plt.show()

    def metrics(self):
        # airline_sentiment is already mapped to integers in __init__ (negative=0, neutral=1, positive=2)
        pos_count = self.df[self.df.airline_sentiment == 2]['tweet_id'].count()
        neg_count = self.df[self.df.airline_sentiment == 0]['tweet_id'].count()
        neut_count = self.df[self.df.airline_sentiment == 1]['tweet_id'].count()

        print("tot: {}, pos: {}, neutr: {}, neg: {}".format(self.df.shape[0], pos_count, neut_count, neg_count))
        print('max sentence length', MAX_LEN)

        # distribution of tweet lengths (in characters)
        reviews_len = self.df['max_len'].values
        pd.Series(reviews_len).hist()
        # plt.show()
        print(pd.Series(reviews_len).describe())
        plt.savefig('tweets_len.png')

    def get_data(self):
        return pd.read_csv('data/Tweets.csv')

    def clean_text(self, text_preprocessor):
        clean_text = text_preprocessor.pre_process(self.df['text'])
        self.df['clean_text'] = clean_text

class TextPreprocessor:
    def __init__(self):
        with open('abbreviation.json', 'r') as f:
            self.abbr = json.load(f)

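    # abbreviation.json is assumed to map contracted/abbreviated forms to their expansions,
    # e.g. (hypothetical contents) {"arent": "are not", "dont": "do not", "u": "you"}.
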
    def pre_process(self, data):
        return data.apply(self.pre_process_text)

    def pre_process_text(self, text):
        stops = set(stopwords.words("english"))
        text = text.lower()  # lower case
        # strip @mentions, URLs and any non-alphanumeric characters
        text = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split())

        # expand abbreviations, e.g. convert "arent" to "are not"
        words = text.split()
        words = [self.abbr[word] if word in self.abbr else word for word in words]
        text = " ".join(words)

        # remove punctuation
        tokens = nltk.word_tokenize(text)
        words = [word for word in tokens if word.isalpha()]
        # remove stop words
        words = [w for w in words if not w in stops]

        # lemmatization
        wordnet_lemmatizer = WordNetLemmatizer()
        words = [wordnet_lemmatizer.lemmatize(t) for t in words]

        return ' '.join(words)


tp = TextPreprocessor()
a = AirlineSentiment(tp)
# a.metrics()

a.train()
# a.predict_single_text("It's a disgrace!")

# # debug Word2VecCreator
# docs = ['the cat sat on the bench', 'anarchism originated as a term of abuse']
# wv.train(docs)


## LATEST SCORE
# Epoch 00010: val_acc did not improve
# score: 0.58
# acc: 0.81

class AirlineSentimentPredict:
    def __init__(self, tp, filename, col_name):
        self.model = load_model("saved_models/{}".format(MODEL_FILE))
        with open("saved_models/{}".format(VECTORIZER_FILE), 'rb') as f2:
            self.vect = pickle.load(f2)

        self.df = pd.read_csv(filename)
        self.df['max_len'] = self.df[col_name].apply(lambda x: len(x))

        self.col_name = col_name
        self.df['clean_text'] = tp.pre_process(self.df[col_name])

        # self.df['max_len'].max()

    def predict(self):
        sequences = self.vect.texts_to_sequences(self.df['clean_text'].values)
        X_test = pad_sequences(sequences, maxlen=MAX_LEN)
        print('predict...')

        preds = self.model.predict(X_test)
        y_preds = [self.prob_to_sentiment_label(pred) for pred in preds]

        prob_map = ['negative', 'neutral', 'positive']

        # per-row dict of class name -> predicted probability
        probs = []
        for pred in preds:
            di = {}
            for i, prob in enumerate(pred):
                di[prob_map[i]] = prob
            probs.append(di)

        ## probs = ["{}:{}".format(prob_map[i[0]], prob) for i, prob in enumerate(preds)]

        self.df['pred'] = y_preds
        self.df['prob'] = probs

        submission = self.df[[self.col_name, 'pred', 'prob']]
        timestr = time.strftime("%Y%m%d-%H%M%S")
        submission.to_csv("predictions-{}.csv".format(timestr))

    def prob_to_sentiment_label(self, pred):
        # THRESHOLD = .4
        # return 0 if pred[0] > THRESHOLD else 1

        return np.argmax(pred)

# p = AirlineSentimentPredict(tp, 'data/test.csv', 'Snippet')
# p.predict()