Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Wed Nov 8 19:20:49 2023
- @author: Lenovo
- """
- from ucimlrepo import fetch_ucirepo
- import matplotlib.pyplot as plt
- import warnings
- from sklearn.model_selection import train_test_split
- import pandas as pd
- import seaborn as sns
- from sklearn.preprocessing import StandardScaler
- from sklearn.feature_selection import mutual_info_classif
- import numpy as np
- from skfeature.function.similarity_based import fisher_score
- from sklearn.ensemble import RandomForestClassifier as rfc
- from sklearn.feature_selection import RFECV
- import sklearn
- from sklearn.feature_selection import RFE
- from collections import Counter
- from catboost import CatBoostClassifier
- from sklearn.pipeline import make_pipeline
- from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
- from xgboost import XGBClassifier
- from lightgbm import LGBMClassifier
- from sklearn.model_selection import cross_val_score
- from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
- import optuna
- import functools
- from imblearn.over_sampling import SMOTE
# Fetch the UCI Wine Quality dataset (id=186) via ucimlrepo.
wine_quality = fetch_ucirepo(id=186)
# data (as pandas dataframes)
x = wine_quality.data.features
y = wine_quality.data.targets
# Checking for null values (results inspected interactively).
x.isnull().values.any()
y.isnull().values.any()
y.hist(bins=25, figsize=(10, 10))
# Looks like we only have observations starting from quality 3,
# and there are not many observations for quality 3 and 9.
df = pd.concat([x, y], axis=1)
# Group wine quality into 3 classes:
#   quality >= 7      -> 2 (high)
#   5 <= quality <= 6 -> 1 (medium)
#   quality <  5      -> 0 (low)
df['quality_class'] = 0  # initialize all rows to 0 (low quality)
# BUG FIX: the original condition was `> 7`, which silently left
# quality == 7 labelled as low (0) — a gap in the stated three-way split.
df.loc[df['quality'] >= 7, 'quality_class'] = 2  # high quality
df.loc[(df['quality'] >= 5) & (df['quality'] <= 6), 'quality_class'] = 1  # medium quality
df['quality_class'].unique()
df.drop('quality', inplace=True, axis=1)
# 80/20 train/test split on the engineered target.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['quality_class'], axis=1), df['quality_class'],
    test_size=0.2, random_state=1)
# The dataset is imbalanced, so oversample the TRAINING set with SMOTE to
# equalize the representation of the 3 classes (test set is left untouched).
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)
sns.countplot(x=y_train)
plt.show()
warnings.filterwarnings('ignore')
# Distribution plot + box plot for every numeric feature.
fig, ax = plt.subplots(12, 2, figsize=(13, 40))
for index, i in enumerate(df.drop(['quality_class'], axis=1).columns):
    sns.distplot(df[i], ax=ax[index, 0])
    sns.boxplot(df[i], ax=ax[index, 1], linewidth=1.2)
    plt.grid()
fig.tight_layout()
fig.subplots_adjust(top=0.95)
plt.suptitle("Visualization of Numeric Features", fontsize=20)
sns.pairplot(df, hue='quality_class', corner=True)
sns.heatmap(df.sort_values(by='quality_class', ascending=True).corr(method='spearman'),
            cmap='viridis', annot=True)
# alcohol and density have the highest correlation. This suggests that we
# might need to exclude 1 variable after feature selection.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Add a uniformly-distributed noise column to the training set; it acts as a
# baseline for feature selection (any real feature should beat pure noise).
# BUG FIX: the sample count was hard-coded to 8781 (the post-SMOTE training
# size of one particular run); derive it from the data so a change in the
# split or resampling does not break this line.
uniform_samples = np.random.uniform(0, 1, X_train.shape[0])
X_train = pd.DataFrame(X_train, columns=df.drop(columns=['quality_class']).columns)
X_train['uniform_dist_var'] = uniform_samples
X_test = pd.DataFrame(X_test, columns=df.drop(columns=['quality_class']).columns)
# Reset indices so the targets align positionally with the rebuilt frames.
y_train = y_train.reset_index()
y_train = y_train['quality_class']
y_test = y_test.reset_index()
y_test = y_test['quality_class']
df_train = pd.concat([X_train, y_train], axis=1)
# Checking distribution of all features
X_train.hist(bins=25, figsize=(10, 10))
plt.show()
# Checking correlation including the dependent variable
x_vars = df_train.drop(columns=['quality_class'], axis=1).columns
sns.heatmap(df_train.sort_values(by='quality_class', ascending=True).corr(method='spearman'),
            cmap='viridis', annot=True)
# Feature Selection — Information Gain (mutual information with the target):
ig = mutual_info_classif(X_train, y_train)
# Pair each feature name with its score.
# IDIOM FIX: the original maintained a manual counter `j` inside a loop;
# dict(zip(...)) expresses the same pairing directly.
feature_scores = dict(zip(x_vars, ig))
# Sort the features by importance score in descending order
sorted_features = sorted(feature_scores.items(), key=lambda kv: kv[1], reverse=True)
for feature, score in sorted_features:
    print('Feature:', feature, 'Score:', score)
# Plot a horizontal bar chart of the feature importance scores
fig, ax = plt.subplots()
y_pos = np.arange(len(sorted_features))
ax.barh(y_pos, [score for feature, score in sorted_features], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels([feature for feature, score in sorted_features])
ax.invert_yaxis()  # Labels read top-to-bottom
ax.set_xlabel("Importance Score")
ax.set_title("Feature Importance Scores (Information Gain)")
# Add importance scores as labels on the horizontal bar chart
for i, v in enumerate([score for feature, score in sorted_features]):
    ax.text(v + 0.01, i, str(round(v, 3)), color="black", fontweight="bold")
plt.show()
# Fisher score (rank-based importance; higher rank = more important):
rank = fisher_score.fisher_score(X_train.to_numpy(), y_train.to_numpy(), mode='rank')
# IDIOM FIX: replaced the manual counter loop with dict(zip(...)).
features_rank = dict(zip(x_vars, rank))
# Sort the features by importance score in descending order
features_rank_descending = sorted(features_rank.items(), key=lambda kv: kv[1], reverse=True)
features_rank_descending
# Rank (the higher value, the more important the variable):
# [('total_sulfur_dioxide', 11),
#  ('alcohol', 10),
#  ('sulphates', 9),
#  ('free_sulfur_dioxide', 8),
#  ('citric_acid', 7),
#  ('density', 6),
#  ('residual_sugar', 5),
#  ('volatile_acidity', 4),
#  ('chlorides', 3),
#  ('pH', 2),
#  ('fixed_acidity', 1),
#  ('uniform_dist_var', 0)]
# Feature selection using Random Forest + RFECV, averaged over many random
# states to smooth out the stochasticity of a single forest.
number_of_random_states = 30
average_optimal = np.zeros(12)  # one slot per candidate feature count (1..12)
scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average='weighted')
# IDIOM FIX: the original kept a separate manual counter `i` that merely
# duplicated the loop variable; enumerate() provides the progress index.
for i, rs in enumerate(range(number_of_random_states), start=1):
    rf_classifier = rfc(random_state=rs)
    rfecv = RFECV(estimator=rf_classifier, step=1, cv=5, scoring=scorer)
    rfecv = rfecv.fit(X_train, y_train)
    average_optimal += np.asarray(rfecv.cv_results_["mean_test_score"])
    print ('progress ' + str(round(i/number_of_random_states*100)) + '%')
average_optimal /= number_of_random_states
plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1), average_optimal)
print("Number of features selected :", np.argmax(average_optimal)+1)
print("Evaluation of the optimal f1 :", np.max(average_optimal))
plt.show()
# Out of 12 variables, only 11 improve the model performance; the last one
# makes performance deteriorate. I suspect it is the uniformly distributed
# noise variable. Check whether it appears among the top 11 variables.
most_appearing_features = []
for rs in range(10):
    rf_classifier = rfc(random_state=rs)
    rfe = RFE(estimator=rf_classifier, n_features_to_select=11, step=1)
    rfe = rfe.fit(X_train, y_train)
    most_appearing_features.append(X_train.columns[rfe.support_].tolist())
# Flatten the per-run lists into one list of selected feature names.
most_appearing_features = [item for sublist in most_appearing_features for item in sublist]
print('Most appearing features :')
Counter(most_appearing_features).most_common(11)
# [('volatile_acidity', 10),
#  ('residual_sugar', 10),
#  ('chlorides', 10),
#  ('free_sulfur_dioxide', 10),
#  ('total_sulfur_dioxide', 10),
#  ('density', 10),
#  ('pH', 10),
#  ('sulphates', 10),
#  ('alcohol', 10)]
# The uniformly distributed noise variable never shows up in the top 11 most
# appearing variables: every real feature outperforms noise, so drop it.
X_train.drop(['uniform_dist_var'], inplace=True, axis=1)
- ##### Fitting data to different boosting models using hyperparameter tunning #####
- # GBM:
def objective_gbc(trial, X, y):
    """Optuna objective for GradientBoostingClassifier.

    Samples hyperparameters from *trial* and returns the mean 5-fold
    cross-validation accuracy on (X, y) — the value Optuna maximizes.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        # FIX: suggest_loguniform is deprecated in Optuna; suggest_float with
        # log=True is the equivalent modern API already used elsewhere here.
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e+0, log=True),
        'random_state': trial.suggest_int('random_state', 0, 0),  # pinned to 0
    }
    clf = make_pipeline(GradientBoostingClassifier(**params))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)
- # XGBoost:
def objective_xgbc(trial, X, y):
    """Optuna objective for XGBClassifier.

    Samples hyperparameters from *trial* and returns the mean 5-fold
    cross-validation accuracy on (X, y) — the value Optuna maximizes.
    """
    params = {
        'objective': trial.suggest_categorical('objective', ['multi:softmax']),
        'random_state': trial.suggest_int('random_state', 0, 0),  # pinned to 0
        'n_estimators': trial.suggest_int('n_estimators', 10, 500, step=10),  # num_boosting_round
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        # FIX: suggest_loguniform is deprecated in Optuna; suggest_float with
        # log=True is the equivalent API already used for the params below.
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.1, 0.05, 0.01]),  # eta
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'min_split_loss': trial.suggest_float('min_split_loss', 1e-8, 1.0, log=True),  # gamma
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),  # 'gblinear' excluded
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1.0, log=True),  # lambda
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),  # alpha
    }
    clf = make_pipeline(XGBClassifier(**params, use_label_encoder=False, num_class=3))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)
- # CatBoost:
def objective_catboost(trial, X, y):
    """Optuna objective for CatBoostClassifier.

    Samples hyperparameters from *trial* and returns the mean 5-fold
    cross-validation accuracy on (X, y) — the value Optuna maximizes.
    """
    # NOTE: keep the suggest-call order stable; a seeded sampler draws values
    # in call order, so reordering would change the search trajectory.
    lr = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True)
    tree_depth = trial.suggest_int("depth", 1, 10)
    # "subsample" was deliberately left out of the search space.
    col_frac = trial.suggest_float("colsample_bylevel", 0.05, 1.0)
    leaf_min = trial.suggest_int("min_data_in_leaf", 1, 100)
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=lr,
        depth=tree_depth,
        colsample_bylevel=col_frac,
        min_data_in_leaf=leaf_min,
        silent=True,
    )
    pipeline = make_pipeline(model)
    return np.mean(cross_val_score(pipeline, X, y, cv=5))
- # Light Gradient Boosting:
def objective_lgbc(trial, X, y):
    """Optuna objective for LGBMClassifier.

    Samples hyperparameters from *trial* and returns the mean 5-fold
    cross-validation accuracy on (X, y) — the value Optuna maximizes.
    """
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        "verbosity": -1,
        # BUG FIX: the Optuna parameter name had a trailing space
        # ('boosting_type '), so study.best_params carried the malformed key
        # and LGBMClassifier(**best_params) downstream received an unknown
        # keyword instead of the intended boosting_type setting.
        "boosting_type": trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        "num_class": 3,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    clf = make_pipeline(LGBMClassifier(**params))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)
- # Histogram Gradient Boosting:
def objective_hgbc(trial, X, y):
    """Optuna objective for HistGradientBoostingClassifier.

    Samples hyperparameters from *trial* and returns the mean 5-fold
    cross-validation accuracy on (X, y) — the value Optuna maximizes.
    """
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_iter": 1000,  # fixed boosting-iteration budget
        # BUG FIX: scikit-learn requires max_leaf_nodes to be strictly greater
        # than 1; the original lower bound of 1 made those trials error out.
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        "l2_regularization": trial.suggest_float("l2_regularization", 1e-6, 1e3, log=True),
        "max_bins": trial.suggest_int('max_bins', 2, 255),
    }
    clf = make_pipeline(HistGradientBoostingClassifier(**params))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)
def calculate_score(model, X_train, y_train, X_test, y_test):
    """Fit *model* on the training data and score it on the test data.

    Returns a 4-tuple (accuracy, f1, recall, precision); the latter three
    are support-weighted averages across the classes.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions, average='weighted'),
        recall_score(y_test, predictions, average='weighted'),
        precision_score(y_test, predictions, average='weighted'),
    )
# Hyperparameter tuning: one Optuna study per boosting algorithm, 30
# random-search trials each (RandomSampler seeded for reproducibility).
study_gbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_gbc.optimize(functools.partial(objective_gbc, X=X_train, y=y_train), n_trials=30)
study_xgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_xgbc.optimize(functools.partial(objective_xgbc, X=X_train, y=y_train), n_trials=30)
study_catboost = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_catboost.optimize(functools.partial(objective_catboost, X=X_train, y=y_train), n_trials=30)
study_lgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_lgbc.optimize(functools.partial(objective_lgbc, X=X_train, y=y_train), n_trials=30)
study_hgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_hgbc.optimize(functools.partial(objective_hgbc, X=X_train, y=y_train), n_trials=30)
# Creating classifiers using the tuned parameters obtained from the studies:
gbc = GradientBoostingClassifier(**study_gbc.best_params)
xgbc = XGBClassifier(**study_xgbc.best_params, use_label_encoder=False)
catboost = CatBoostClassifier(**study_catboost.best_params)
lgbc = LGBMClassifier(**study_lgbc.best_params)
hgbc = HistGradientBoostingClassifier(**study_hgbc.best_params)
# BUG FIX: the original list was [gbc, xgbc, lgbc, gbc, hgbc] — `gbc` appeared
# twice and `catboost` was missing, so the scores reported under "CatBoost"
# were actually LightGBM's and "Light GBM" duplicated Gradient Boosting
# (which is why the historical results below contain two identical rows).
models = [gbc, xgbc, catboost, lgbc, hgbc]
names = ['Gradient Boosting Classifier', 'XGBoost Classifier', 'CatBoost Classifier',
         'Light GBM Classifier', 'Hist Gradient Boosting Classifier']
# Calculating various model performance metrics on the held-out test set.
scorelist = {}
for name, classifier in zip(names, models):
    acc, f1, recall, precision = calculate_score(classifier, X_train, y_train, X_test, y_test)
    scorelist[name] = (acc, f1, recall, precision)
scorelist
# RESULT COMPARISON (NOTE: numbers below were produced by the original,
# mislabelled model list and should be regenerated after the fix) #
# 'Gradient Boosting Classifier': (acc = 0.8224489795918367, f1 = 0.8157758677913398, recall = 0.8224489795918367, precision = 0.812542143991651)
# 'XGBoost Classifier': (acc = 0.8040816326530612, f1 = 0.8014447959294754, recall = 0.8040816326530612, precision = 0.8000441051063351))
# 'CatBoost Classifier': (acc = 0.8142857142857143, f1 = 0.8073504595990978, recall = 0.8142857142857143, precision = 0.8035260754952066)
# 'Light GBM Classifier': (acc = 0.8224489795918367, f1 = 0.8157758677913398, recall = 0.8224489795918367, precision = 0.812542143991651)
# 'Hist Gradient Boosting Classifier': (acc = 0.8112244897959183, f1 = 0.806735193615864, recall = 0.8112244897959183, precision = 0.8035021325007075)
# The identical "Gradient Boosting" and "Light GBM" rows were caused by the
# duplicated `gbc` entry fixed above, not by the algorithms converging.
# Best overall performer(s): Gradient Boosting and Light GBM, followed by CatBoost, Hist Gradient Boosting and XGBoost
# Reason why XGBoost underperforms in this dataset perhaps because of regularization which reduces overfitting on training sample.
# More specifically, perhaps because the testing data closely resemble training data, any algorithm which penalizes overfitting
# might underperform compared to learning algorithm which has no or little regularization.
# Catboost underperforms compared to other learning algorithms (except XGBoost) perhaps due to explanatory variables are continuous rather than categorical.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement