Advertisement
Guest User

Untitled

a guest
Apr 21st, 2019
119
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.96 KB | None | 0 0
  1. import logging
  2. import json
  3. import pandas as pd
  4. import numpy as np
  5. from pprint import pprint
  6. from time import time
  7. from os import getcwd, listdir
  8. from os.path import isfile, join
  9. from sklearn.neural_network import MLPClassifier
  10. from scipy.stats import randint as sp_randint
  11. from sklearn.model_selection import train_test_split
  12. from sklearn.metrics import accuracy_score, average_precision_score
  13. from sklearn.model_selection import RandomizedSearchCV
  14. from sklearn.neighbors import KNeighborsClassifier
  # Logging setup: create this module's logger and configure the root logger
  # at DEBUG level.
  logger = logging.getLogger(__name__)
  logging.basicConfig(level=logging.DEBUG)
  18.  
  def get_series_ids(x):
      """Return a Series of integer ids, one per element of ``x``.

      Every distinct value in ``x`` gets an id equal to its position in the
      sorted array of unique values (as produced by ``np.unique``), so equal
      values always share the same id.

      Example:
          get_series_ids(pd.Series(['a', 'a', 'b', 'b', 'c']))
          returns Series([0, 0, 1, 1, 2], dtype=int)

      Args:
          x: pandas Series whose values can be sorted by ``np.unique``.

      Returns:
          A Series with the same index as ``x`` holding the integer ids.
      """
      values = np.unique(x)
      values2nums = {value: idx for idx, value in enumerate(values)}
      # Series.map is a plain per-element lookup and yields an integer Series
      # directly; Series.replace (used before) is a substitution API whose
      # result dtype depends on pandas' downcasting rules.
      return x.map(values2nums)
  29.  
  # Getting all preprocessed records.
  # Each regular file directly inside <cwd>/labeled_data/preprocessed is read
  # line by line; every non-blank line is parsed as one JSON record.
  records = []
  # Build the path from components so the script is not tied to Windows
  # backslash separators, and reuse the same directory for listing and opening.
  rec_dir = join('labeled_data', 'preprocessed')
  mypath = join(getcwd(), rec_dir)
  onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
  logging.info('Loading preprocessed records')
  for file in onlyfiles:
      logging.info(f'Loading file {file}')
      with open(join(mypath, file), 'r') as infile:
          # Skip blank lines (e.g. a trailing newline at end of file):
          # json.loads('') would raise and abort the whole load.
          records.extend(line for line in infile if line.strip())
  df = pd.DataFrame.from_records(map(json.loads, records))
  logging.info('Finished loading records')
  43.  
  # Encode the textual class label (user/crowd/streamer/bot/other) as a number;
  # labels outside this table become NaN and are removed by dropna() below.
  class_to_code = {'user': 0, 'crowd': 1, 'streamer': 2, 'bot': 3, 'other': 4}
  df['class'] = df['class'].map(class_to_code)
  df['most_freq_POS_tag'] = get_series_ids(df['most_freq_POS_tag'])

  # Cast every scalar feature column to float.
  for float_col in ('lastSec', 'last5Sec', 'last10Sec',
                    'user_mention', 'bot_command', 'emote_ratio'):
      df[float_col] = df[float_col].astype(float)

  # Drop rows with missing values, then renumber the rows for sklearn
  # (reset_index keeps the old index as an extra 'index' column).
  df = df.dropna().reset_index()
  63.  
  # Baseline class distribution: number of messages per class and each class's
  # share (in percent) of all messages.
  tallies = df.groupby(['class'])['class'].count()
  total_msg_count = int(df['class'].count())
  # Series.get(..., 0): a class that never occurs in the data counts as zero
  # instead of raising KeyError.
  user_msg_count = int(tallies.get(0, 0))
  crowd_msg_count = int(tallies.get(1, 0))
  streamer_msg_count = int(tallies.get(2, 0))
  bot_msg_count = int(tallies.get(3, 0))
  other_msg_count = int(tallies.get(4, 0))

  # NOTE: these divisions still raise ZeroDivisionError when no records
  # survived preprocessing; there is nothing meaningful to report in that case.
  user_msg_pct = (user_msg_count / total_msg_count) * 100
  crowd_msg_pct = (crowd_msg_count / total_msg_count) * 100
  streamer_msg_pct = (streamer_msg_count / total_msg_count) * 100
  bot_msg_pct = (bot_msg_count / total_msg_count) * 100
  other_msg_pct = (other_msg_count / total_msg_count) * 100

  # Rows follow the class codes 0..4 (user, crowd, streamer, bot, other).
  summary_df = pd.DataFrame([user_msg_pct, crowd_msg_pct,
                             streamer_msg_pct, bot_msg_pct,
                             other_msg_pct], columns=['pct_total_msg'])

  logging.debug(f'TALLIES: {tallies}')
  logging.debug(f'\nCOUNTS:\nTotal messages: {total_msg_count};\nUser messages: {user_msg_count};\nCrowd messages: {crowd_msg_count};\nStreamer messages: {streamer_msg_count};\nBot messages: {bot_msg_count};\nOther {other_msg_count}\n')
  logging.debug(summary_df)
  86.  
  # Feature matrices (one per embedding type) and the label vector to predict.
  # Both matrices share every column except the embedding one.
  # NOTE(review): if the embedding columns hold per-row vectors, .values gives
  # an object array -- confirm the downstream estimators accept that.
  leading_cols = ['user_mention', 'bot_command', 'emote_ratio']
  trailing_cols = ['most_freq_POS_tag', 'lastSec', 'last5Sec', 'last10Sec']
  cc_model_data_vals = df[leading_cols + ['cc_embedding'] + trailing_cols].values
  bert_model_data_vals = df[leading_cols + ['bert_embedding'] + trailing_cols].values
  label_data_vals = df['class'].values
  91.  
  92.  
  # Utility function to report best scores
  def report(results, n_top=3):
      """Print the top-ranked search candidates and return the best parameters.

      Args:
          results: mapping shaped like ``RandomizedSearchCV.cv_results_``; the
              keys read are 'rank_test_score', 'mean_test_score',
              'std_test_score' and 'params'.
          n_top: number of rank levels (1..n_top) to print.

      Returns:
          The parameter dict of the best-ranked candidate (the first one if
          several tie for the best rank).
      """
      ranks = np.asarray(results['rank_test_score'])
      for i in range(1, n_top + 1):
          for candidate in np.flatnonzero(ranks == i):
              print("Model with rank: {0}".format(i))
              print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
              print("Parameters: {0}".format(results['params'][candidate]))
              print("")
      # Previously this returned results['params'][1], i.e. whichever candidate
      # happened to be sampled second, regardless of how it ranked.
      return results['params'][int(np.argmin(ranks))]
  105.  
  106.  
  def run_classifier(model_data_vals, label_data_vals, model):
      """Tune, fit and evaluate a classifier on a random 60/40 train/test split.

      A 20-candidate ``RandomizedSearchCV`` with 10-fold cross-validation is run
      on the training part; the best estimator found is then used to predict
      the held-out test part.

      Args:
          model_data_vals: feature matrix, one row per message.
          label_data_vals: class labels aligned with ``model_data_vals``.
          model: 'mlp' selects a multi-layer perceptron; any other value
              selects k-nearest-neighbours.

      Returns:
          Tuple ``(predictions_array, pred_df, best_clf, X_test, y_test, nums)``
          where ``pred_df`` has columns 'test' (true label) and 'pred', and
          ``nums`` is ``[true_pos_user, false_pos_user, false_neg_user]`` for
          class 0 ('user').
      """
      # train & test data
      X_train, X_test, y_train, y_test = train_test_split(model_data_vals,
                                                          label_data_vals,
                                                          test_size=.4)
      clf, param_dist = _build_search(model)
      n_iter_search = 20
      random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                         n_iter=n_iter_search, cv=10)
      start = time()
      random_search.fit(X_train, y_train)

      print("RandomizedSearchCV took %.2f seconds for %d candidates"
            " parameter settings." % ((time() - start), n_iter_search))
      report(random_search.cv_results_)

      # With the default refit=True the search has already retrained the best
      # parameter setting on all of X_train, so use that estimator directly
      # instead of rebuilding one from whatever report() returns.
      best_clf = random_search.best_estimator_
      predictions_array = best_clf.predict(X_test)

      pred_df = pd.DataFrame({'test': y_test, 'pred': predictions_array})
      nums = _user_confusion_counts(y_test, predictions_array)
      return predictions_array, pred_df, best_clf, X_test, y_test, nums


  def _build_search(model):
      """Return ``(estimator, param_distributions)`` for the requested model."""
      if model == 'mlp':
          # Multi-layer perceptron search space
          return MLPClassifier(), {
              'activation': ['identity', 'logistic', 'tanh', 'relu'],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'batch_size': sp_randint(1, 11),
              'learning_rate': ['constant', 'invscaling', 'adaptive']}
      # kNN search space
      return KNeighborsClassifier(), {
          'n_neighbors': sp_randint(5, 11),
          'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
          'leaf_size': sp_randint(30, 50),
      }


  def _user_confusion_counts(y_true, y_pred):
      """Return ``[true_pos, false_pos, false_neg]`` for class 0 ('user')."""
      truth_is_user = np.asarray(y_true) == 0
      pred_is_user = np.asarray(y_pred) == 0
      true_pos_user = int(np.sum(truth_is_user & pred_is_user))
      # False positive: predicted 'user' but the true label is something else.
      false_pos_user = int(np.sum(~truth_is_user & pred_is_user))
      # False negative: a true 'user' message predicted as another class.
      # (These two counters were swapped before, which swapped the reported
      # precision and recall.)
      false_neg_user = int(np.sum(truth_is_user & ~pred_is_user))
      return [true_pos_user, false_pos_user, false_neg_user]
  173.  
  def get_summary_stats(predictions_array, predictions_df, classifier, X_test, y_test, nums):
      """Print the prediction distribution, accuracy and 'user' precision/recall.

      Args:
          predictions_array: predicted class codes, one per test sample.
          predictions_df: ignored; kept only so existing callers keep working
              (a fresh frame is built from ``predictions_array``).
          classifier: fitted estimator exposing ``score(X, y)``.
          X_test: test features passed to ``classifier.score``.
          y_test: true test labels passed to ``classifier.score``.
          nums: ``[true_pos_user, false_pos_user, false_neg_user]``.

      Returns:
          None; everything is printed with ``pprint``.
      """
      predictions_df = pd.DataFrame(predictions_array, columns=['pred_class'])
      class_counts = predictions_df.groupby(['pred_class'])['pred_class'].count()
      pprint(class_counts)
      pred_total_msg_count = int(predictions_df['pred_class'].count())

      # Share of predictions for class codes 0..3 (user, crowd, streamer, bot).
      # A class the model never predicted counts as zero instead of raising
      # KeyError. NOTE: class 4 ('other') is not part of this summary.
      pred_pcts = []
      for label in range(4):
          label_count = int(class_counts.get(label, 0))
          if pred_total_msg_count:
              pred_pcts.append((label_count / pred_total_msg_count) * 100)
          else:
              pred_pcts.append(0.0)

      pred_summary_df = pd.DataFrame(pred_pcts, columns=['pred_pct_total_msg'])

      pprint(pred_summary_df)

      # Model Statistics

      # Mean accuracy on the test split as provided by sklearn, in percent
      mean_accuracy_score = classifier.score(X_test, y_test) * 100
      pprint(mean_accuracy_score)

      # Precision and recall on classifying only 'user' messages; reported as
      # NaN when undefined (no predicted / no actual 'user' messages).
      true_pos, false_pos, false_neg = nums[0], nums[1], nums[2]
      predicted_users = true_pos + false_pos
      actual_users = true_pos + false_neg
      precision = true_pos / predicted_users if predicted_users else float('nan')
      recall = true_pos / actual_users if actual_users else float('nan')

      pprint("True pos user: " + str(nums[0]))
      pprint("False pos user: " + str(nums[1]))
      pprint("False neg user: " + str(nums[2]))
      pprint("Precision(USER): " + str(precision))
      pprint("Recall(USER): " + str(recall))
  208.  
  # Evaluate both feature sets (cc / bert embeddings) with both model families.
  # The log labels now name the feature set actually passed in, and the kNN
  # sections request a non-'mlp' model so run_classifier takes its kNN branch
  # (previously all four runs trained an MLP).
  logging.debug('CC RESULTS MLP')
  cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums = run_classifier(cc_model_data_vals, label_data_vals, 'mlp')
  get_summary_stats(cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums)

  logging.debug('BERT RESULTS MLP')
  bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums = run_classifier(bert_model_data_vals, label_data_vals, 'mlp')
  get_summary_stats(bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums)

  logging.debug('CC RESULTS kNN')
  cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums = run_classifier(cc_model_data_vals, label_data_vals, 'knn')
  get_summary_stats(cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums)

  logging.debug('BERT RESULTS kNN')
  bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums = run_classifier(bert_model_data_vals, label_data_vals, 'knn')
  get_summary_stats(bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement