Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Sat Oct 28 23:46:48 2023
- @author: Lenovo
- """
- from sklearn.datasets import load_breast_cancer
- from sklearn.model_selection import train_test_split
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- import seaborn as sns
- from sklearn.ensemble import RandomForestClassifier as rfc
- from sklearn.feature_selection import RFECV
- from sklearn.preprocessing import StandardScaler
- from collections import Counter
- from sklearn.feature_selection import RFE
- from sklearn.linear_model import LogisticRegression
- from sklearn.model_selection import StratifiedKFold
- import statsmodels.api as sm
- from mlxtend.feature_selection import SequentialFeatureSelector
- from sklearn_genetic import GASearchCV
- from sklearn.ensemble import AdaBoostClassifier
- from sklearn.ensemble import GradientBoostingClassifier
- from skopt import BayesSearchCV
- import optuna
- from sklearn.feature_selection import SelectFromModel
- from sklearn.metrics import roc_auc_score
- from sklearn.metrics import f1_score
- from sklearn.metrics import roc_curve
- from sklearn.metrics import precision_score
- from sklearn.metrics import accuracy_score
- from sklearn.metrics import precision_recall_curve
# NOTE(review): removed a stray bare `precision_recall_curve` expression that
# followed the imports — it evaluated the function object and discarded it (a no-op).
# Load the breast-cancer dataset and assemble it into a DataFrame.
data = load_breast_cancer()
feature_names = data.feature_names
print(data.DESCR)
df = pd.DataFrame(data.data, columns=data.feature_names)

# Report missing values. The original evaluated df.isnull().sum() and
# df.isna().sum() as bare expressions, which print nothing when run as a
# script (isna is an alias of isnull, so one check suffices).
print(df.isnull().sum())

df['target'] = data.target
# Class balance: 357 positives (target=1) vs 212 negatives (target=0).
print(df['target'].value_counts())

# Split off 20% as the test set, then 15% of the remainder as validation.
# (The original comment claimed a 30-70 ratio, which did not match the code.)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['target']), df['target'], test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=1)
# Feature scaling: fit the scaler on the training data ONLY, then apply the
# same transformation to the validation and test sets.
# BUG FIX: the original called sc.fit_transform(X_val), which re-fit the
# scaler on validation statistics (data leakage) — and because the scaler was
# re-fit, the subsequent sc.transform(X_test) scaled the test set with
# validation-set means/variances instead of training-set ones.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
X_test = sc.transform(X_test)

# StandardScaler returns bare ndarrays; restore DataFrame structure so the
# feature names survive for the selection steps below.
feature_cols = df.drop(columns=['target']).columns
X_train = pd.DataFrame(X_train, columns=feature_cols)
X_val = pd.DataFrame(X_val, columns=feature_cols)
X_test = pd.DataFrame(X_test, columns=feature_cols)
# Spearman correlation heatmap of all training features. Many pairs are
# strongly correlated, which suggests omitting some variables to avoid an
# unstable model on the test set.
fig, axis = plt.subplots(figsize=(18, 18))
train_corr = X_train.corr(method='spearman')
sns.heatmap(train_corr, annot=True, linewidths=.5, fmt='.1f', ax=axis)
# Feature selection, step 1: recursive feature elimination with CV (RFECV)
# under a random-forest classifier, averaged over several seeds, to choose
# how many features the logistic regression should keep.
number_of_random_states = 30
# Size the accumulator from the data: the original hard-coded np.zeros(30),
# which only worked because this dataset happens to have exactly 30 features.
average_optimal = np.zeros(X_train.shape[1])
# enumerate replaces the original hand-maintained counter `i`.
for i, rs in enumerate(range(number_of_random_states), start=1):
    rf_classifier = rfc(random_state=rs)
    rfecv = RFECV(estimator=rf_classifier, step=1, cv=5, scoring='f1')
    rfecv = rfecv.fit(X_train, y_train)
    average_optimal += np.asarray(rfecv.cv_results_["mean_test_score"])
    print('progress ' + str(round(i / number_of_random_states * 100)) + '%')
average_optimal /= number_of_random_states
plt.plot(range(1, len(average_optimal) + 1), average_optimal)
print("Number of features selected :", np.argmax(average_optimal) + 1)
print("Evaluation of the optimal f1 :", np.max(average_optimal))
plt.show()
# Number of features selected : 15
# Feature selection, step 2: list the 15 features most often chosen by RFE
# across 10 random-forest seeds.
most_appearing_features = []
for rs in range(10):
    rf_classifier = rfc(random_state=rs)
    rfe = RFE(estimator=rf_classifier, n_features_to_select=15, step=1)
    rfe = rfe.fit(X_train, y_train)
    most_appearing_features.append(X_train.columns[rfe.support_].tolist())
# Flatten the per-seed lists and count how often each feature was selected.
most_appearing_features = [item for sublist in most_appearing_features for item in sublist]
print('Most appearing features :')
# BUG FIX: the original evaluated most_common(15) as a bare expression, so
# the announced list was never actually printed when run as a script.
print(Counter(most_appearing_features).most_common(15))
- # [('mean radius', 10),
- # ('mean texture', 10),
- # ('mean area', 10),
- # ('mean concavity', 10),
- # ('mean concave points', 10),
- # ('area error', 10),
- # ('worst radius', 10),
- # ('worst texture', 10),
- # ('worst perimeter', 10),
- # ('worst area', 10),
- # ('worst smoothness', 10),
- # ('worst concavity', 10),
- # ('worst concave points', 10),
- # ('mean perimeter', 9),
- # ('worst compactness', 9)]
# Correlation matrix of the 15 features that RFE selects most often.
top_features = ['mean radius', 'mean texture', 'mean area', 'mean concavity',
                'mean concave points', 'area error', 'worst radius',
                'worst texture', 'worst perimeter', 'worst area',
                'worst smoothness', 'worst concavity', 'worst concave points',
                'mean perimeter', 'worst compactness']
X_train_selected = X_train[top_features]
fig, axis = plt.subplots(figsize=(18, 18))
sns.heatmap(X_train_selected.corr(method='spearman'), annot=True,
            linewidths=.5, fmt='.1f', ax=axis)
# Fit the 15 candidate features with statsmodels' Logit to check which of
# them are statistically significant. NOTE(review): no intercept column is
# added (no sm.add_constant), matching the original analysis.
X = X_train_selected
# y_train keeps the original DataFrame index while X was rebuilt with a
# fresh 0..n index after scaling, so realign before fitting.
Y = y_train.reset_index()['target']
logit_model = sm.Logit(Y, X)
logit_result = logit_model.fit(method="nm", maxiter=5000)
print(logit_result.summary2())
# Per the Logit summary: mean texture, mean concavity, mean concave points,
# worst perimeter and worst smoothness are statistically significant.
# BUG FIX: the original lists contained 'mean concave points' TWICE, which
# silently double-weights that column in the downstream logistic model; the
# duplicate is removed here.
# NOTE(review): 'mean texture' is named significant in the comment above but
# was never included in the original selection — left out to preserve the
# original feature set; confirm intent.
final_features = ['mean concave points', 'worst perimeter',
                  'worst smoothness', 'mean concavity']
X_train_selected_final = X_train[final_features]
X_test_selected = X_test[final_features]
X_val_selected = X_val[final_features]
# Baseline logistic regression (L1 penalty, liblinear) on the selected variables.
model = LogisticRegression(solver='liblinear', max_iter=500, penalty='l1')
model.fit(X_train_selected_final, Y)
coefficients = model.coef_

# In-sample F1 score.
# BUG FIX: the original predicted on X_train_selected (the 15-column frame)
# while the model was trained on the final-feature frame — a feature-count
# mismatch; it also labelled the in-sample score "Test Set".
y_pred = model.predict(X_train_selected_final)
f1_in_sample = f1_score(y_train, y_pred)
print(f"F1 Score on Training Set: {f1_in_sample:.3f}")  # in-sample F1 ~ 0.975

# Out-of-sample F1 score.
y_pred = model.predict(X_test_selected)
f1_out_sample = f1_score(y_test, y_pred)
print(f"F1 Score on Test Set: {f1_out_sample:.3f}")  # out-of-sample F1 ~ 0.959
# Hyperparameter tuning with Optuna.
def logit_objective(trial: optuna.trial.Trial):
    """Optuna objective: validation-set F1 of a tuned logistic regression.

    BUG FIX: the original fit on X_train_selected (15 columns) but predicted
    on X_val_selected (the final feature subset), which raises a
    feature-count mismatch; train and validation now use the same final
    selected features.
    """
    C = trial.suggest_float("C", 0.001, 10)
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    model = LogisticRegression(solver='liblinear', C=C, penalty=penalty)
    model.fit(X_train_selected_final, y_train)
    y_pred = model.predict(X_val_selected)
    f1 = f1_score(y_val, y_pred)
    return f1
# Run the Optuna search: TPE sampler with a fixed seed for reproducibility,
# maximizing validation-set F1 over 100 trials.
sampler = optuna.samplers.TPESampler(seed=999)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(logit_objective, n_trials=100)
# Summarize the search outcome.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
# Best trial (observed on the original run):
# Value: 0.9772727272727273
# Params:
# C: 8.0344769727568
# penalty: l1
# Refit the logistic model with the best hyperparameters found by Optuna.
# BUG FIX: the original fit on X_train_selected (15 columns) while scoring
# the final-feature test frame — a feature-count mismatch; it also used
# predict_log_proba where predict_proba is the conventional score for a
# precision-recall curve.
logistic_model = LogisticRegression(solver='liblinear',
                                    penalty=trial.params["penalty"],
                                    C=trial.params["C"],
                                    max_iter=10000, random_state=0)
logistic_model_fit = logistic_model.fit(X_train_selected_final, y_train)

# Precision-recall curve on the test set (positive-class probabilities).
y_prob = logistic_model_fit.predict_proba(X_test_selected)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_prob)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

# In-sample F1 (the original mislabelled this print as "Test Set").
y_pred = logistic_model.predict(X_train_selected_final)
f1_in_sample = f1_score(y_train, y_pred)
print(f"F1 Score on Training Set: {f1_in_sample:.3f}")  # in-sample F1 ~ 0.975

# Out-of-sample F1.
y_pred = logistic_model.predict(X_test_selected)
f1_out_sample = f1_score(y_test, y_pred)
print(f"F1 Score on Test Set: {f1_out_sample:.3f}")  # out-of-sample F1 ~ 0.959
# Looks like there is no improvement in either in-sample or out-of-sample
# performance after hyperparameter tuning.
############### ADA BOOST ###############
# Fit an AdaBoost model with default hyperparameters.
ada_model = AdaBoostClassifier(random_state=1)
ada_model.fit(X_train, y_train)

# In-sample F1 score (perfect fit on training data).
y_pred = ada_model.predict(X_train)
print(f1_score(y_train, y_pred))  # f1_score = 1.0

# Out-of-sample F1 score.
y_pred = ada_model.predict(X_test)
print(f1_score(y_test, y_pred))  # f1_score = 0.95238
# Hyperparameter tuning using Optuna.
def ADA_objective(trial):
    """Validation-set F1 for an AdaBoost model with sampled hyperparameters."""
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
    }
    model = AdaBoostClassifier(random_state=1, **params)
    model.fit(X_train, y_train)
    return f1_score(y_val, model.predict(X_val))
# Run the Optuna search for AdaBoost (TPE sampler, fixed seed, 100 trials,
# maximizing validation-set F1).
sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(ADA_objective, n_trials=100)
# Summarize the search outcome.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
# Best trial (observed on the original run):
# Value: 0.98876
# Params:
# n_estimators: 238
# learning_rate: 0.2758347554916674
# Refit AdaBoost with the tuned hyperparameters.
best = trial.params
ada_model = AdaBoostClassifier(n_estimators=best["n_estimators"],
                               learning_rate=best["learning_rate"],
                               random_state=1)
ada_model.fit(X_train, y_train)

# Feature-importance bar chart, sorted from most to least important; the
# tick labels are permuted with the same index so bars and names line up.
feature_importances = ada_model.feature_importances_
sorted_idx = np.argsort(feature_importances)[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(X_train.shape[1]), feature_importances[sorted_idx], align="center")
plt.xticks(range(X_train.shape[1]), data.feature_names[sorted_idx], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.title("Feature Importances in AdaBoost Model")
plt.show()
# mean perimeter, smoothness error, mean smoothness and mean radius have
# feature importance = 0!

# Out-of-sample F1 with tuned hyperparameters.
y_pred = ada_model.predict(X_test)
print(f1_score(y_test, y_pred))  # f1_score = 0.97260
# Correlation matrix of the features that remain after dropping the
# zero-importance ones.
zero_importance = ['mean perimeter', 'smoothness error', 'mean smoothness',
                   'mean radius']
X_train_selected_features = X_train.drop(columns=zero_importance)
fig, axis = plt.subplots(figsize=(18, 18))
sns.heatmap(X_train_selected_features.corr(method='spearman'), annot=True,
            linewidths=.5, fmt='.1f', ax=axis)

# radius error correlates almost perfectly with area error => drop radius
# error (it has the lower importance score). worst perimeter, worst area and
# mean area correlate almost perfectly with worst radius => drop those three
# for the same reason.
dropped = zero_importance + ['radius error', 'worst perimeter', 'worst area',
                             'mean area']
X_train_selected_features = X_train.drop(columns=dropped)
X_test_selected_features = X_test.drop(columns=dropped)
X_val_selected_features = X_val.drop(columns=dropped)
# Hyperparameter tuning restricted to the selected features.
def ADA_objective(trial):
    """Validation-set F1 of AdaBoost trained on the reduced feature set."""
    n_est = trial.suggest_int("n_estimators", 50, 500)
    lr = trial.suggest_float("learning_rate", 0.01, 1.0, log=True)
    clf = AdaBoostClassifier(n_estimators=n_est, learning_rate=lr, random_state=1)
    clf.fit(X_train_selected_features, y_train)
    return f1_score(y_val, clf.predict(X_val_selected_features))
# Run the Optuna search over the reduced feature set (TPE sampler, fixed
# seed, 100 trials, maximizing validation-set F1).
sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(ADA_objective, n_trials=100)
# Summarize the search outcome.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
# Best trial (observed on the original run):
# Value: 0.98876 — an improvement in the in-sample F1 score
# Params:
# n_estimators: 142
# learning_rate: 0.5704727088203682
# Refit AdaBoost on the reduced feature set with the tuned hyperparameters.
ada_model = AdaBoostClassifier(n_estimators=trial.params["n_estimators"],
                               learning_rate=trial.params["learning_rate"],
                               random_state=1)
ada_model.fit(X_train_selected_features, y_train)

# Feature-importance bar chart. The bars are sorted by importance, so the
# tick labels must be permuted the same way.
# BUG FIX: the original labelled the sorted bars with the UNSORTED
# feature_names_in_, misattributing every bar to the wrong feature.
feature_importances = ada_model.feature_importances_
sorted_idx = feature_importances.argsort()[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(X_train_selected_features.shape[1]),
        feature_importances[sorted_idx], align="center")
plt.xticks(range(X_train_selected_features.shape[1]),
           ada_model.feature_names_in_[sorted_idx], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.title("Feature Importances in ADA Boost Model")
plt.show()

# Out-of-sample F1 on the reduced feature set.
y_pred = ada_model.predict(X_test_selected_features)
print(f1_score(y_test, y_pred))  # 0.95890, worse than 0.97260 before dropping features
############### GRADIENT BOOSTING ###############
# Fit a gradient-boosting model with default hyperparameters.
GB_model = GradientBoostingClassifier(random_state=1)
GB_model.fit(X_train, y_train)

# In-sample F1 score (perfect fit on training data).
y_pred = GB_model.predict(X_train)
print(f1_score(y_train, y_pred))  # f1_score = 1.0

# Out-of-sample F1 score.
y_pred = GB_model.predict(X_test)
print(f1_score(y_test, y_pred))  # f1_score = 0.9517
# Hyperparameter tuning via Optuna.
def GBM_objective(trial):
    """Validation-set F1 for a GradientBoostingClassifier with sampled params."""
    model = GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.001, 1.0, log=True),
        max_depth=trial.suggest_int("max_depth", 2, 10),
        subsample=trial.suggest_float("subsample", 0.1, 1.0, step=0.1),
        random_state=1,
    )
    model.fit(X_train, y_train)
    return f1_score(y_val, model.predict(X_val))
# Run the Optuna search for gradient boosting (TPE sampler, fixed seed,
# 100 trials, maximizing validation-set F1).
sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(GBM_objective, n_trials=100)
# Summarize the search outcome.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
# Best trial (observed on the original run):
# Value: 1.0
# Params:
# n_estimators: 98
# learning_rate: 0.5257150170904044
# max_depth: 4
# subsample: 0.9
# Refit gradient boosting with the tuned hyperparameters.
best = trial.params
GB_model = GradientBoostingClassifier(n_estimators=best["n_estimators"],
                                      learning_rate=best["learning_rate"],
                                      max_depth=best["max_depth"],
                                      subsample=best["subsample"],
                                      random_state=1)
GB_model.fit(X_train, y_train)

# Visualize feature importances, sorted from most to least important; the
# tick labels are permuted with the same index so bars and names line up.
feature_importances = GB_model.feature_importances_
sorted_idx = np.argsort(feature_importances)[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(X_train.shape[1]), feature_importances[sorted_idx], align="center")
plt.xticks(range(X_train.shape[1]), data.feature_names[sorted_idx], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.title("Feature Importances in Gradient Boosting Model")
plt.show()
# Per the importance chart: worst perimeter, worst concave points, worst
# texture, mean concave points, concave points error and worst radius are
# the most important features. Check their pairwise correlations.
top6 = ['worst perimeter', 'worst concave points', 'worst texture',
        'mean concave points', 'concave points error', 'worst radius']
X_train_selected_features = X_train[top6]
fig, axis = plt.subplots(figsize=(18, 18))
sns.heatmap(X_train_selected_features.corr(method='spearman'), annot=True,
            linewidths=.5, fmt='.1f', ax=axis)
# worst radius correlates perfectly with worst perimeter; since worst
# perimeter has the higher importance, drop worst radius.
# Final variables: worst perimeter, worst concave points, worst texture,
# mean concave points, concave points error.

# Before switching to the selected variables, check the out-of-sample result
# of the current (all-features) model specification.
y_pred = GB_model.predict(X_test)
print(f1_score(y_test, y_pred))  # f1_score = 0.9726

# Build the reduced train/val/test frames from the selected features.
final_gb_features = ['worst perimeter', 'worst concave points',
                     'worst texture', 'mean concave points',
                     'concave points error']
X_train_selected_features = X_train[final_gb_features]
X_test_selected_features = X_test[final_gb_features]
X_val_selected_features = X_val[final_gb_features]
# Hyperparameter tuning on the selected features.
def GBM_objective(trial):
    """Validation-set F1 of gradient boosting trained on the reduced feature set."""
    n_est = trial.suggest_int("n_estimators", 50, 500)
    lr = trial.suggest_float("learning_rate", 0.001, 1.0, log=True)
    depth = trial.suggest_int("max_depth", 2, 10)
    frac = trial.suggest_float("subsample", 0.1, 1.0, step=0.1)
    clf = GradientBoostingClassifier(n_estimators=n_est, learning_rate=lr,
                                     max_depth=depth, subsample=frac,
                                     random_state=1)
    clf.fit(X_train_selected_features, y_train)
    return f1_score(y_val, clf.predict(X_val_selected_features))
# Run the Optuna search over the reduced feature set (TPE sampler, fixed
# seed, 100 trials, maximizing validation-set F1).
sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(GBM_objective, n_trials=100)
# Summarize the search outcome.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
# Best trial (observed on the original run):
# Value: 1.0
# Params:
# n_estimators: 359
# learning_rate: 0.31906341975134206
# max_depth: 2
# subsample: 0.8
# Refit gradient boosting on the selected features with the tuned params.
GB_model = GradientBoostingClassifier(n_estimators=trial.params["n_estimators"],
                                      learning_rate=trial.params["learning_rate"],
                                      max_depth=trial.params["max_depth"],
                                      subsample=trial.params["subsample"],
                                      random_state=1)
GB_model.fit(X_train_selected_features, y_train)

# Feature-importance chart. The bars are sorted by importance, so the tick
# labels must be permuted with sorted_idx as well.
# BUG FIX: the original used the UNSORTED feature_names_in_ as labels for
# sorted bars, mislabelling every bar.
feature_importances = GB_model.feature_importances_
sorted_idx = feature_importances.argsort()[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(X_train_selected_features.shape[1]),
        feature_importances[sorted_idx], align="center")
plt.xticks(range(X_train_selected_features.shape[1]),
           GB_model.feature_names_in_[sorted_idx], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.title("Feature Importances in Gradient Boosting Model")
plt.show()

# Out-of-sample F1 on the reduced feature set.
y_pred = GB_model.predict(X_test_selected_features)
print(f1_score(y_test, y_pred))  # 0.9796 => better than 0.9726 (before dropping worst radius)
############# MODEL COMPARISON #############
# Performance is compared via out-of-sample F1 score before and after
# hyperparameter tuning:
# Logistic model, default hyperparameters + feature selection: 0.952
# Logistic model, tuned hyperparameters + feature selection: 0.952
# AdaBoost, default hyperparameters: 0.95238
# AdaBoost, tuned hyperparameters: 0.97260
# AdaBoost, tuned hyperparameters + feature selection: 0.9589
# GB, default hyperparameters: 0.9517
# GB, tuned hyperparameters: 0.9726
# GB, tuned hyperparameters + feature selection: 0.9796
# Both AdaBoost and GB perform better than logistic regression after
# hyperparameter tuning and feature selection, as evidenced by the higher F1
# scores of both boosting models.
# Hyperparameter tuning has a positive impact on both AdaBoost and Gradient
# Boosting (F1 increased slightly after tuning) but almost no impact on the
# logistic model — perhaps because logistic regression has far fewer
# hyperparameters to tune than the boosting models.
# Feature selection based on feature importance plus Spearman correlation
# improves Gradient Boosting, hurts AdaBoost, and leaves the logistic model
# essentially unchanged.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement