Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import logging
- import json
- import pandas as pd
- import numpy as np
- from pprint import pprint
- from time import time
- from os import getcwd, listdir
- from os.path import isfile, join
- from sklearn.neural_network import MLPClassifier
- from scipy.stats import randint as sp_randint
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import accuracy_score, average_precision_score
- from sklearn.model_selection import RandomizedSearchCV
- from sklearn.neighbors import KNeighborsClassifier
# --- Logging setup ---
# Configure the root logger at DEBUG so the data-loading and model-metric
# traces emitted below are all visible on the console.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
def get_series_ids(x):
    """Map each distinct value in pandas Series *x* to a stable integer id.

    Ids are assigned in sorted order of the unique values, e.g.
    get_series_ids(pd.Series(['a','a','b','b','c'])) returns
    Series([0, 0, 1, 1, 2]).
    """
    # np.unique returns the distinct values in sorted order, so the ids
    # are deterministic for a given set of values.
    id_of = {value: idx for idx, value in enumerate(np.unique(x))}
    return x.replace(id_of)
# --- Load all preprocessed, labeled records into a DataFrame ---
# Each file under labeled_data/preprocessed/ holds one JSON object per line.
# NOTE(review): the separators are Windows-specific ('\\'); os.path.join on
# a relative path would be portable -- kept as-is to preserve behavior.
records = []
rec_dir = '\\labeled_data\\preprocessed\\'
mypath = getcwd() + rec_dir
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
logging.info('Loading preprocessed records')
for fname in onlyfiles:
    logging.info('Loading file %s', fname)
    # BUG FIX: reuse the directory resolved above instead of re-building an
    # unrelated relative path literal (the original mixed an absolute listing
    # path with a hard-coded relative open path).
    with open(join(mypath, fname), 'r') as infile:
        # Iterating the file yields one JSON line per record.
        records.extend(infile)
df = pd.DataFrame.from_records(map(json.loads, records))
logging.info('Finished loading records')
# --- Feature encoding ---
# Map the speaker-role label onto a numeric class id for sklearn.
CLASS_IDS = {'user': 0,
             'crowd': 1,
             'streamer': 2,
             'bot': 3,
             'other': 4}
df['class'] = df['class'].map(CLASS_IDS)
# The most-frequent POS tag is categorical text; replace it with integer ids.
df['most_freq_POS_tag'] = get_series_ids(df['most_freq_POS_tag'])
# sklearn expects float features.
for _col in ('lastSec', 'last5Sec', 'last10Sec',
             'user_mention', 'bot_command', 'emote_ratio'):
    df[_col] = df[_col].astype(float)
# Drop rows with missing values and re-index so positions are contiguous.
df = df.dropna().reset_index()
# --- Baseline class-distribution metrics (before any modeling) ---
tallies = df.groupby(['class'])['class'].count()
# Total non-null labels. BUG FIX: the original parsed this number out of
# .count().to_string().split() -- fragile string surgery for a plain count.
total_msg_count = int(df['class'].count())
# Per-class counts; .get(key, 0) yields 0 for a class absent from the data
# instead of raising KeyError as direct indexing would.
user_msg_count = int(tallies.get(0, 0))
crowd_msg_count = int(tallies.get(1, 0))
streamer_msg_count = int(tallies.get(2, 0))
bot_msg_count = int(tallies.get(3, 0))
other_msg_count = int(tallies.get(4, 0))
# Share of total messages contributed by each class, in percent.
user_msg_pct = (user_msg_count / total_msg_count) * 100
crowd_msg_pct = (crowd_msg_count / total_msg_count) * 100
streamer_msg_pct = (streamer_msg_count / total_msg_count) * 100
bot_msg_pct = (bot_msg_count / total_msg_count) * 100
other_msg_pct = (other_msg_count / total_msg_count) * 100
summary_df = pd.DataFrame([user_msg_pct, crowd_msg_pct,
                           streamer_msg_pct, bot_msg_pct,
                           other_msg_pct], columns=['pct_total_msg'])
logging.debug(f'TALLIES: {tallies}')
logging.debug(f'\nCOUNTS:\nTotal messages: {total_msg_count};\nUser messages: {user_msg_count};\nCrowd messages: {crowd_msg_count};\nStreamer messages: {streamer_msg_count};\nBot messages: {bot_msg_count};\nOther {other_msg_count}\n')
logging.debug(summary_df)
# --- Feature matrices and label vector for the two embedding variants ---
# Both matrices share the same columns except for the sentence embedding used.
_lead_cols = ['user_mention', 'bot_command', 'emote_ratio']
_tail_cols = ['most_freq_POS_tag', 'lastSec', 'last5Sec', 'last10Sec']
cc_model_data_vals = df[_lead_cols + ['cc_embedding'] + _tail_cols].values
bert_model_data_vals = df[_lead_cols + ['bert_embedding'] + _tail_cols].values
# Flatten the single-column label frame into a 1-D array for sklearn.
label_data_vals = df[['class']].values.ravel()
def report(results, n_top=3):
    """Print the top *n_top* candidates of a randomized search and return
    the parameters of the best (rank-1) candidate.

    Parameters
    ----------
    results : dict
        A ``RandomizedSearchCV.cv_results_`` mapping containing
        'rank_test_score', 'mean_test_score', 'std_test_score' and 'params'.
    n_top : int, optional
        Number of ranks to print (default 3).

    Returns
    -------
    dict or None
        Parameter settings of the first rank-1 candidate, or None when
        *results* has no rank-1 entry.
    """
    best_params = None
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            # Remember the first candidate whose rank is 1 -- the winner.
            if i == 1 and best_params is None:
                best_params = results['params'][candidate]
    # BUG FIX: the original returned results['params'][1] -- the candidate at
    # flat index 1, which is unrelated to ranking -- instead of the rank-1
    # candidate's parameters.
    return best_params
def run_classifier(model_data_vals, label_data_vals, model):
    """Randomized hyper-parameter search, refit, and per-user-class tallies.

    Parameters
    ----------
    model_data_vals : ndarray
        Feature matrix (one row per message).
    label_data_vals : ndarray
        Numeric class labels; 0 denotes the 'user' class.
    model : str
        'mlp' selects a multi-layer perceptron; any other value selects kNN.

    Returns
    -------
    tuple
        (predictions_array, pred_df, best_clf, X_test, y_test,
         [true_pos_user, false_pos_user, false_neg_user])
    """
    # 60/40 train/test split.
    X_train, X_test, y_train, y_test = train_test_split(model_data_vals,
                                                        label_data_vals,
                                                        test_size=.4)
    # Only the estimator class and its search space differ between the two
    # families; the search/refit/predict steps below are shared (the original
    # duplicated them wholesale in each branch).
    if model == 'mlp':
        param_dist = {
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'solver': ['lbfgs', 'sgd', 'adam'],
            'batch_size': sp_randint(1, 11),
            'learning_rate': ['constant', 'invscaling', 'adaptive']}
        estimator_cls = MLPClassifier
    else:
        param_dist = {
            'n_neighbors': sp_randint(5, 11),
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'leaf_size': sp_randint(30, 50),
        }
        estimator_cls = KNeighborsClassifier

    n_iter_search = 20
    random_search = RandomizedSearchCV(estimator_cls(),
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=10)
    start = time()
    random_search.fit(X_train, y_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    params = report(random_search.cv_results_)

    # Refit a fresh estimator with the winning parameters, then predict.
    best_clf = estimator_cls(**params)
    best_clf.fit(X_train, y_train)
    predictions_array = best_clf.predict(X_test)

    # Confusion tallies for the 'user' class (label 0).
    # BUG FIX: the original swapped the two error counts -- truth==0 with
    # pred!=0 is a MISS (false negative), while pred==0 with truth!=0 is a
    # false positive -- so the precision/recall reported downstream were
    # transposed.
    true_pos_user = 0
    false_pos_user = 0
    false_neg_user = 0
    pred_df = pd.DataFrame({'test': y_test, 'pred': predictions_array})
    for _, row in pred_df.iterrows():
        if row['test'] == 0.0 or row['pred'] == 0.0:
            if row['pred'] == row['test']:
                true_pos_user += 1
            elif row['test'] == 0.0:
                false_neg_user += 1
            else:
                false_pos_user += 1
    return (predictions_array, pred_df, best_clf, X_test, y_test,
            [true_pos_user, false_pos_user, false_neg_user])
def get_summary_stats(predictions_array, predictions_df, classifier, X_test, y_test, nums):
    """Print the predicted class distribution plus user-class precision/recall.

    Parameters
    ----------
    predictions_array : array-like
        Predicted numeric class labels for X_test.
    predictions_df : DataFrame
        Unused; kept for backward compatibility with existing callers
        (the original immediately overwrote this argument).
    classifier : estimator
        Fitted classifier exposing ``score(X, y)``.
    X_test, y_test : array-like
        Held-out features and labels for the mean-accuracy score.
    nums : list
        [true_pos_user, false_pos_user, false_neg_user] from run_classifier.
    """
    predictions_df = pd.DataFrame(predictions_array, columns=['pred_class'])
    # Compute the per-class tally once (the original re-ran the groupby for
    # every class); .get(key, 0) yields 0 for a class that was never
    # predicted instead of raising KeyError.
    pred_counts = predictions_df.groupby(['pred_class'])['pred_class'].count()
    pprint(pred_counts)
    pred_total_msg_count = len(predictions_df)
    pred_user_msg_count = int(pred_counts.get(0, 0))
    pred_crowd_msg_count = int(pred_counts.get(1, 0))
    pred_streamer_msg_count = int(pred_counts.get(2, 0))
    pred_bot_msg_count = int(pred_counts.get(3, 0))
    pred_user_msg_pct = (pred_user_msg_count / pred_total_msg_count) * 100
    pred_crowd_msg_pct = (pred_crowd_msg_count / pred_total_msg_count) * 100
    pred_streamer_msg_pct = (pred_streamer_msg_count / pred_total_msg_count) * 100
    pred_bot_msg_pct = (pred_bot_msg_count / pred_total_msg_count) * 100
    pred_summary_df = pd.DataFrame([pred_user_msg_pct, pred_crowd_msg_pct,
                                    pred_streamer_msg_pct, pred_bot_msg_pct],
                                   columns=['pred_pct_total_msg'])
    pprint(pred_summary_df)
    # Model Statistics
    # Mean accuracy over the held-out split, as a percentage.
    mean_accuracy_score = classifier.score(X_test, y_test) * 100
    pprint(mean_accuracy_score)
    # Precision/recall for the 'user' class only; guard the denominators so
    # a run with zero user predictions/labels reports 0.0 rather than
    # raising ZeroDivisionError.
    tp, fp, fn = nums
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    pprint("True pos user: " + str(tp))
    pprint("False pos user: " + str(fp))
    pprint("False neg user: " + str(fn))
    pprint("Precision(USER): " + str(precision))
    pprint("Recall(USER): " + str(recall))
# --- Run both classifier families on both embedding variants ---
# BUG FIX: the debug labels were swapped (the 'BERT' label preceded the run
# on cc_model_data_vals and vice versa), and both runs labeled "kNN" actually
# passed model='mlp', so kNN never ran. Labels and model arguments now match
# the data being evaluated.
logging.debug('CC RESULTS MLP')
cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums = run_classifier(cc_model_data_vals, label_data_vals, 'mlp')
get_summary_stats(cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums)
logging.debug('BERT RESULTS MLP')
bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums = run_classifier(bert_model_data_vals, label_data_vals, 'mlp')
get_summary_stats(bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums)
logging.debug('CC RESULTS kNN')
cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums = run_classifier(cc_model_data_vals, label_data_vals, 'knn')
get_summary_stats(cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums)
logging.debug('BERT RESULTS kNN')
bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums = run_classifier(bert_model_data_vals, label_data_vals, 'knn')
get_summary_stats(bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement