Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import logging
- import json
- import pandas as pd
- import numpy as np
- from pprint import pprint
- from time import time
- from os import getcwd, listdir
- from os.path import isfile, join
- from sklearn.neural_network import MLPClassifier
- from scipy.stats import randint as sp_randint
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import accuracy_score, average_precision_score
- from sklearn.model_selection import RandomizedSearchCV
- from sklearn.neighbors import KNeighborsClassifier
# --- Logging setup ---
# Configure the root logger at DEBUG so the data-loading and model-metric
# traces emitted below are all visible on the console.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
def get_series_ids(x):
    """Map each distinct value in pandas Series *x* to a stable integer id.

    Ids are assigned in sorted order of the unique values, e.g.
    get_series_ids(pd.Series(['a','a','b','b','c'])) returns
    Series([0, 0, 1, 1, 2]).
    """
    # np.unique returns the distinct values in sorted order, so the ids
    # are deterministic for a given set of values.
    id_of = {value: idx for idx, value in enumerate(np.unique(x))}
    return x.replace(id_of)
# --- Load all preprocessed, labeled records into a DataFrame ---
# Each file under labeled_data/preprocessed/ holds one JSON object per line.
# NOTE(review): the separators are Windows-specific ('\\'); os.path.join on
# a relative path would be portable -- kept as-is to preserve behavior.
records = []
rec_dir = '\\labeled_data\\preprocessed\\'
mypath = getcwd() + rec_dir
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
logging.info('Loading preprocessed records')
for fname in onlyfiles:
    logging.info('Loading file %s', fname)
    # BUG FIX: reuse the directory resolved above instead of re-building an
    # unrelated relative path literal (the original mixed an absolute listing
    # path with a hard-coded relative open path).
    with open(join(mypath, fname), 'r') as infile:
        # Iterating the file yields one JSON line per record.
        records.extend(infile)
df = pd.DataFrame.from_records(map(json.loads, records))
logging.info('Finished loading records')
# --- Feature encoding ---
# Map the speaker-role label onto a numeric class id for sklearn.
CLASS_IDS = {'user': 0,
             'crowd': 1,
             'streamer': 2,
             'bot': 3,
             'other': 4}
df['class'] = df['class'].map(CLASS_IDS)
# The most-frequent POS tag is categorical text; replace it with integer ids.
df['most_freq_POS_tag'] = get_series_ids(df['most_freq_POS_tag'])
# sklearn expects float features.
for _col in ('lastSec', 'last5Sec', 'last10Sec',
             'user_mention', 'bot_command', 'emote_ratio'):
    df[_col] = df[_col].astype(float)
# Drop rows with missing values and re-index so positions are contiguous.
df = df.dropna().reset_index()
# --- Baseline class-distribution metrics (before any modeling) ---
tallies = df.groupby(['class'])['class'].count()
# Total non-null labels. BUG FIX: the original parsed this number out of
# .count().to_string().split() -- fragile string surgery for a plain count.
total_msg_count = int(df['class'].count())
# Per-class counts; .get(key, 0) yields 0 for a class absent from the data
# instead of raising KeyError as direct indexing would.
user_msg_count = int(tallies.get(0, 0))
crowd_msg_count = int(tallies.get(1, 0))
streamer_msg_count = int(tallies.get(2, 0))
bot_msg_count = int(tallies.get(3, 0))
other_msg_count = int(tallies.get(4, 0))
# Share of total messages contributed by each class, in percent.
user_msg_pct = (user_msg_count / total_msg_count) * 100
crowd_msg_pct = (crowd_msg_count / total_msg_count) * 100
streamer_msg_pct = (streamer_msg_count / total_msg_count) * 100
bot_msg_pct = (bot_msg_count / total_msg_count) * 100
other_msg_pct = (other_msg_count / total_msg_count) * 100
summary_df = pd.DataFrame([user_msg_pct, crowd_msg_pct,
                           streamer_msg_pct, bot_msg_pct,
                           other_msg_pct], columns=['pct_total_msg'])
logging.debug(f'TALLIES: {tallies}')
logging.debug(f'\nCOUNTS:\nTotal messages: {total_msg_count};\nUser messages: {user_msg_count};\nCrowd messages: {crowd_msg_count};\nStreamer messages: {streamer_msg_count};\nBot messages: {bot_msg_count};\nOther {other_msg_count}\n')
logging.debug(summary_df)
# --- Feature matrices and label vector for the two embedding variants ---
# Both matrices share the same columns except for the sentence embedding used.
_lead_cols = ['user_mention', 'bot_command', 'emote_ratio']
_tail_cols = ['most_freq_POS_tag', 'lastSec', 'last5Sec', 'last10Sec']
cc_model_data_vals = df[_lead_cols + ['cc_embedding'] + _tail_cols].values
bert_model_data_vals = df[_lead_cols + ['bert_embedding'] + _tail_cols].values
# Flatten the single-column label frame into a 1-D array for sklearn.
label_data_vals = df[['class']].values.ravel()
def report(results, n_top=3):
    """Print the top *n_top* candidates of a randomized search and return
    the parameters of the best (rank-1) candidate.

    Parameters
    ----------
    results : dict
        A ``RandomizedSearchCV.cv_results_`` mapping containing
        'rank_test_score', 'mean_test_score', 'std_test_score' and 'params'.
    n_top : int, optional
        Number of ranks to print (default 3).

    Returns
    -------
    dict or None
        Parameter settings of the first rank-1 candidate, or None when
        *results* has no rank-1 entry.
    """
    best_params = None
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            # Remember the first candidate whose rank is 1 -- the winner.
            if i == 1 and best_params is None:
                best_params = results['params'][candidate]
    # BUG FIX: the original returned results['params'][1] -- the candidate at
    # flat index 1, which is unrelated to ranking -- instead of the rank-1
    # candidate's parameters.
    return best_params
def run_classifier(model_data_vals, label_data_vals, model):
    """Randomized hyper-parameter search, refit, and per-user-class tallies.

    Parameters
    ----------
    model_data_vals : ndarray
        Feature matrix (one row per message).
    label_data_vals : ndarray
        Numeric class labels; 0 denotes the 'user' class.
    model : str
        'mlp' selects a multi-layer perceptron; any other value selects kNN.

    Returns
    -------
    tuple
        (predictions_array, pred_df, best_clf, X_test, y_test,
         [true_pos_user, false_pos_user, false_neg_user])
    """
    # 60/40 train/test split.
    X_train, X_test, y_train, y_test = train_test_split(model_data_vals,
                                                        label_data_vals,
                                                        test_size=.4)
    # Only the estimator class and its search space differ between the two
    # families; the search/refit/predict steps below are shared (the original
    # duplicated them wholesale in each branch).
    if model == 'mlp':
        param_dist = {
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'solver': ['lbfgs', 'sgd', 'adam'],
            'batch_size': sp_randint(1, 11),
            'learning_rate': ['constant', 'invscaling', 'adaptive']}
        estimator_cls = MLPClassifier
    else:
        param_dist = {
            'n_neighbors': sp_randint(5, 11),
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'leaf_size': sp_randint(30, 50),
        }
        estimator_cls = KNeighborsClassifier

    n_iter_search = 20
    random_search = RandomizedSearchCV(estimator_cls(),
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=10)
    start = time()
    random_search.fit(X_train, y_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    params = report(random_search.cv_results_)

    # Refit a fresh estimator with the winning parameters, then predict.
    best_clf = estimator_cls(**params)
    best_clf.fit(X_train, y_train)
    predictions_array = best_clf.predict(X_test)

    # Confusion tallies for the 'user' class (label 0).
    # BUG FIX: the original swapped the two error counts -- truth==0 with
    # pred!=0 is a MISS (false negative), while pred==0 with truth!=0 is a
    # false positive -- so the precision/recall reported downstream were
    # transposed.
    true_pos_user = 0
    false_pos_user = 0
    false_neg_user = 0
    pred_df = pd.DataFrame({'test': y_test, 'pred': predictions_array})
    for _, row in pred_df.iterrows():
        if row['test'] == 0.0 or row['pred'] == 0.0:
            if row['pred'] == row['test']:
                true_pos_user += 1
            elif row['test'] == 0.0:
                false_neg_user += 1
            else:
                false_pos_user += 1
    return (predictions_array, pred_df, best_clf, X_test, y_test,
            [true_pos_user, false_pos_user, false_neg_user])
def get_summary_stats(predictions_array, predictions_df, classifier, X_test, y_test, nums):
    """Print the predicted class distribution plus user-class precision/recall.

    Parameters
    ----------
    predictions_array : array-like
        Predicted numeric class labels for X_test.
    predictions_df : DataFrame
        Unused; kept for backward compatibility with existing callers
        (the original immediately overwrote this argument).
    classifier : estimator
        Fitted classifier exposing ``score(X, y)``.
    X_test, y_test : array-like
        Held-out features and labels for the mean-accuracy score.
    nums : list
        [true_pos_user, false_pos_user, false_neg_user] from run_classifier.
    """
    predictions_df = pd.DataFrame(predictions_array, columns=['pred_class'])
    # Compute the per-class tally once (the original re-ran the groupby for
    # every class); .get(key, 0) yields 0 for a class that was never
    # predicted instead of raising KeyError.
    pred_counts = predictions_df.groupby(['pred_class'])['pred_class'].count()
    pprint(pred_counts)
    pred_total_msg_count = len(predictions_df)
    pred_user_msg_count = int(pred_counts.get(0, 0))
    pred_crowd_msg_count = int(pred_counts.get(1, 0))
    pred_streamer_msg_count = int(pred_counts.get(2, 0))
    pred_bot_msg_count = int(pred_counts.get(3, 0))
    pred_user_msg_pct = (pred_user_msg_count / pred_total_msg_count) * 100
    pred_crowd_msg_pct = (pred_crowd_msg_count / pred_total_msg_count) * 100
    pred_streamer_msg_pct = (pred_streamer_msg_count / pred_total_msg_count) * 100
    pred_bot_msg_pct = (pred_bot_msg_count / pred_total_msg_count) * 100
    pred_summary_df = pd.DataFrame([pred_user_msg_pct, pred_crowd_msg_pct,
                                    pred_streamer_msg_pct, pred_bot_msg_pct],
                                   columns=['pred_pct_total_msg'])
    pprint(pred_summary_df)
    # Model Statistics
    # Mean accuracy over the held-out split, as a percentage.
    mean_accuracy_score = classifier.score(X_test, y_test) * 100
    pprint(mean_accuracy_score)
    # Precision/recall for the 'user' class only; guard the denominators so
    # a run with zero user predictions/labels reports 0.0 rather than
    # raising ZeroDivisionError.
    tp, fp, fn = nums
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    pprint("True pos user: " + str(tp))
    pprint("False pos user: " + str(fp))
    pprint("False neg user: " + str(fn))
    pprint("Precision(USER): " + str(precision))
    pprint("Recall(USER): " + str(recall))
# --- Run both classifier families on both embedding variants ---
# BUG FIX: the debug labels were swapped (the 'BERT' label preceded the run
# on cc_model_data_vals and vice versa), and both runs labeled "kNN" actually
# passed model='mlp', so kNN never ran. Labels and model arguments now match
# the data being evaluated.
logging.debug('CC RESULTS MLP')
cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums = run_classifier(cc_model_data_vals, label_data_vals, 'mlp')
get_summary_stats(cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums)
logging.debug('BERT RESULTS MLP')
bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums = run_classifier(bert_model_data_vals, label_data_vals, 'mlp')
get_summary_stats(bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums)
logging.debug('CC RESULTS kNN')
cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums = run_classifier(cc_model_data_vals, label_data_vals, 'knn')
get_summary_stats(cc_predictions_array, cc_pred_df, cc_classifier, cc_X_test, cc_y_test, nums)
logging.debug('BERT RESULTS kNN')
bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums = run_classifier(bert_model_data_vals, label_data_vals, 'knn')
get_summary_stats(bert_predictions_array, bert_pred_df, bert_classifier, bert_X_test, bert_y_test, nums)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement