Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Sat Oct 28 23:46:48 2023
- @author: Lenovo
- """
- from sklearn.datasets import load_breast_cancer
- from sklearn.model_selection import train_test_split
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- import seaborn as sns
- from sklearn.ensemble import RandomForestClassifier as rfc
- from sklearn.feature_selection import RFECV
- from sklearn.preprocessing import StandardScaler
- from collections import Counter
- from sklearn.feature_selection import RFE
- from sklearn.linear_model import LogisticRegression
- from sklearn.model_selection import StratifiedKFold
- import statsmodels.api as sm
- from mlxtend.feature_selection import SequentialFeatureSelector
- from sklearn_genetic import GASearchCV
- from sklearn.ensemble import AdaBoostClassifier
- from sklearn.ensemble import GradientBoostingClassifier
- from skopt import BayesSearchCV
- import optuna
- from sklearn.feature_selection import SelectFromModel
- from sklearn.metrics import roc_auc_score
- from sklearn.metrics import f1_score
- from sklearn.metrics import roc_curve
- from sklearn.metrics import precision_score
- from sklearn.metrics import accuracy_score
- from sklearn.metrics import precision_recall_curve
# NOTE(review): removed a stray bare `precision_recall_curve` expression that
# followed the imports — it evaluated the function object and discarded it (a no-op).
# Load the breast-cancer dataset and assemble it into a DataFrame.
data = load_breast_cancer()
feature_names = data.feature_names
print(data.DESCR)
df = pd.DataFrame(data.data, columns=data.feature_names)

# Report missing values. The original evaluated df.isnull().sum() and
# df.isna().sum() as bare expressions, which print nothing when run as a
# script (isna is an alias of isnull, so one check suffices).
print(df.isnull().sum())

df['target'] = data.target
# Class balance: 357 positives (target=1) vs 212 negatives (target=0).
print(df['target'].value_counts())

# Split off 20% as the test set, then 15% of the remainder as validation.
# (The original comment claimed a 30-70 ratio, which did not match the code.)
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['target']), df['target'], test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=1)
# Feature scaling: fit the scaler on the training data ONLY, then apply the
# same transformation to the validation and test sets.
# BUG FIX: the original called sc.fit_transform(X_val), which re-fit the
# scaler on validation statistics (data leakage) — and because the scaler was
# re-fit, the subsequent sc.transform(X_test) scaled the test set with
# validation-set means/variances instead of training-set ones.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
X_test = sc.transform(X_test)

# StandardScaler returns bare ndarrays; restore DataFrame structure so the
# feature names survive for the selection steps below.
feature_cols = df.drop(columns=['target']).columns
X_train = pd.DataFrame(X_train, columns=feature_cols)
X_val = pd.DataFrame(X_val, columns=feature_cols)
X_test = pd.DataFrame(X_test, columns=feature_cols)
# Spearman correlation heatmap of all training features. Many pairs are
# strongly correlated, which suggests omitting some variables to avoid an
# unstable model on the test set.
fig, axis = plt.subplots(figsize=(18, 18))
train_corr = X_train.corr(method='spearman')
sns.heatmap(train_corr, annot=True, linewidths=.5, fmt='.1f', ax=axis)
# Feature selection, step 1: recursive feature elimination with CV (RFECV)
# under a random-forest classifier, averaged over several seeds, to choose
# how many features the logistic regression should keep.
number_of_random_states = 30
# Size the accumulator from the data: the original hard-coded np.zeros(30),
# which only worked because this dataset happens to have exactly 30 features.
average_optimal = np.zeros(X_train.shape[1])
# enumerate replaces the original hand-maintained counter `i`.
for i, rs in enumerate(range(number_of_random_states), start=1):
    rf_classifier = rfc(random_state=rs)
    rfecv = RFECV(estimator=rf_classifier, step=1, cv=5, scoring='f1')
    rfecv = rfecv.fit(X_train, y_train)
    average_optimal += np.asarray(rfecv.cv_results_["mean_test_score"])
    print('progress ' + str(round(i / number_of_random_states * 100)) + '%')
average_optimal /= number_of_random_states
plt.plot(range(1, len(average_optimal) + 1), average_optimal)
print("Number of features selected :", np.argmax(average_optimal) + 1)
print("Evaluation of the optimal f1 :", np.max(average_optimal))
plt.show()
# Number of features selected : 15
# Feature selection, step 2: list the 15 features most often chosen by RFE
# across 10 random-forest seeds.
most_appearing_features = []
for rs in range(10):
    rf_classifier = rfc(random_state=rs)
    rfe = RFE(estimator=rf_classifier, n_features_to_select=15, step=1)
    rfe = rfe.fit(X_train, y_train)
    most_appearing_features.append(X_train.columns[rfe.support_].tolist())
# Flatten the per-seed lists and count how often each feature was selected.
most_appearing_features = [item for sublist in most_appearing_features for item in sublist]
print('Most appearing features :')
# BUG FIX: the original evaluated most_common(15) as a bare expression, so
# the announced list was never actually printed when run as a script.
print(Counter(most_appearing_features).most_common(15))
- # [('mean radius', 10),
- # ('mean texture', 10),
- # ('mean area', 10),
- # ('mean concavity', 10),
- # ('mean concave points', 10),
- # ('area error', 10),
- # ('worst radius', 10),
- # ('worst texture', 10),
- # ('worst perimeter', 10),
- # ('worst area', 10),
- # ('worst smoothness', 10),
- # ('worst concavity', 10),
- # ('worst concave points', 10),
- # ('mean perimeter', 9),
- # ('worst compactness', 9)]
# Correlation matrix of the 15 features that RFE selects most often.
top_features = ['mean radius', 'mean texture', 'mean area', 'mean concavity',
                'mean concave points', 'area error', 'worst radius',
                'worst texture', 'worst perimeter', 'worst area',
                'worst smoothness', 'worst concavity', 'worst concave points',
                'mean perimeter', 'worst compactness']
X_train_selected = X_train[top_features]
fig, axis = plt.subplots(figsize=(18, 18))
sns.heatmap(X_train_selected.corr(method='spearman'), annot=True,
            linewidths=.5, fmt='.1f', ax=axis)
# Fit the 15 candidate features with statsmodels' Logit to check which of
# them are statistically significant. NOTE(review): no intercept column is
# added (no sm.add_constant), matching the original analysis.
X = X_train_selected
# y_train keeps the original DataFrame index while X was rebuilt with a
# fresh 0..n index after scaling, so realign before fitting.
Y = y_train.reset_index()['target']
logit_model = sm.Logit(Y, X)
logit_result = logit_model.fit(method="nm", maxiter=5000)
print(logit_result.summary2())
# Per the Logit summary: mean texture, mean concavity, mean concave points,
# worst perimeter and worst smoothness are statistically significant.
# BUG FIX: the original lists contained 'mean concave points' TWICE, which
# silently double-weights that column in the downstream logistic model; the
# duplicate is removed here.
# NOTE(review): 'mean texture' is named significant in the comment above but
# was never included in the original selection — left out to preserve the
# original feature set; confirm intent.
final_features = ['mean concave points', 'worst perimeter',
                  'worst smoothness', 'mean concavity']
X_train_selected_final = X_train[final_features]
X_test_selected = X_test[final_features]
X_val_selected = X_val[final_features]
# Baseline logistic regression (L1 penalty, liblinear) on the selected variables.
model = LogisticRegression(solver='liblinear', max_iter=500, penalty='l1')
model.fit(X_train_selected_final, Y)
coefficients = model.coef_

# In-sample F1 score.
# BUG FIX: the original predicted on X_train_selected (the 15-column frame)
# while the model was trained on the final-feature frame — a feature-count
# mismatch; it also labelled the in-sample score "Test Set".
y_pred = model.predict(X_train_selected_final)
f1_in_sample = f1_score(y_train, y_pred)
print(f"F1 Score on Training Set: {f1_in_sample:.3f}")  # in-sample F1 ~ 0.975

# Out-of-sample F1 score.
y_pred = model.predict(X_test_selected)
f1_out_sample = f1_score(y_test, y_pred)
print(f"F1 Score on Test Set: {f1_out_sample:.3f}")  # out-of-sample F1 ~ 0.959
# Hyperparameter tuning with Optuna.
def logit_objective(trial: optuna.trial.Trial):
    """Optuna objective: validation-set F1 of a tuned logistic regression.

    BUG FIX: the original fit on X_train_selected (15 columns) but predicted
    on X_val_selected (the final feature subset), which raises a
    feature-count mismatch; train and validation now use the same final
    selected features.
    """
    C = trial.suggest_float("C", 0.001, 10)
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    model = LogisticRegression(solver='liblinear', C=C, penalty=penalty)
    model.fit(X_train_selected_final, y_train)
    y_pred = model.predict(X_val_selected)
    f1 = f1_score(y_val, y_pred)
    return f1
# Run the Optuna search: TPE sampler with a fixed seed for reproducibility,
# maximizing validation-set F1 over 100 trials.
sampler = optuna.samplers.TPESampler(seed=999)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(logit_objective, n_trials=100)
# Summarize the search outcome.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
# Best trial (observed on the original run):
# Value: 0.9772727272727273
# Params:
# C: 8.0344769727568
# penalty: l1
# Refit the logistic model with the best hyperparameters found by Optuna.
# BUG FIX: the original fit on X_train_selected (15 columns) while scoring
# the final-feature test frame — a feature-count mismatch; it also used
# predict_log_proba where predict_proba is the conventional score for a
# precision-recall curve.
logistic_model = LogisticRegression(solver='liblinear',
                                    penalty=trial.params["penalty"],
                                    C=trial.params["C"],
                                    max_iter=10000, random_state=0)
logistic_model_fit = logistic_model.fit(X_train_selected_final, y_train)

# Precision-recall curve on the test set (positive-class probabilities).
y_prob = logistic_model_fit.predict_proba(X_test_selected)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_prob)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

# In-sample F1 (the original mislabelled this print as "Test Set").
y_pred = logistic_model.predict(X_train_selected_final)
f1_in_sample = f1_score(y_train, y_pred)
print(f"F1 Score on Training Set: {f1_in_sample:.3f}")  # in-sample F1 ~ 0.975

# Out-of-sample F1.
y_pred = logistic_model.predict(X_test_selected)
f1_out_sample = f1_score(y_test, y_pred)
print(f"F1 Score on Test Set: {f1_out_sample:.3f}")  # out-of-sample F1 ~ 0.959
# Looks like there is no improvement in either in-sample or out-of-sample
# performance after hyperparameter tuning.
############### ADA BOOST ###############
# Fit an AdaBoost model with default hyperparameters.
ada_model = AdaBoostClassifier(random_state=1)
ada_model.fit(X_train, y_train)

# In-sample F1 score (perfect fit on training data).
y_pred = ada_model.predict(X_train)
print(f1_score(y_train, y_pred))  # f1_score = 1.0

# Out-of-sample F1 score.
y_pred = ada_model.predict(X_test)
print(f1_score(y_test, y_pred))  # f1_score = 0.95238
# Hyperparameter tuning using Optuna.
def ADA_objective(trial):
    """Validation-set F1 for an AdaBoost model with sampled hyperparameters."""
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
    }
    model = AdaBoostClassifier(random_state=1, **params)
    model.fit(X_train, y_train)
    return f1_score(y_val, model.predict(X_val))
# Run the Optuna search for AdaBoost (TPE sampler, fixed seed, 100 trials,
# maximizing validation-set F1).
sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(ADA_objective, n_trials=100)
# Summarize the search outcome.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
# Best trial (observed on the original run):
# Value: 0.98876
# Params:
# n_estimators: 238
# learning_rate: 0.2758347554916674
# Refit AdaBoost with the tuned hyperparameters.
best = trial.params
ada_model = AdaBoostClassifier(n_estimators=best["n_estimators"],
                               learning_rate=best["learning_rate"],
                               random_state=1)
ada_model.fit(X_train, y_train)

# Feature-importance bar chart, sorted from most to least important; the
# tick labels are permuted with the same index so bars and names line up.
feature_importances = ada_model.feature_importances_
sorted_idx = np.argsort(feature_importances)[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(X_train.shape[1]), feature_importances[sorted_idx], align="center")
plt.xticks(range(X_train.shape[1]), data.feature_names[sorted_idx], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.title("Feature Importances in AdaBoost Model")
plt.show()
# mean perimeter, smoothness error, mean smoothness and mean radius have
# feature importance = 0!

# Out-of-sample F1 with tuned hyperparameters.
y_pred = ada_model.predict(X_test)
print(f1_score(y_test, y_pred))  # f1_score = 0.97260
# Correlation matrix of the features that remain after dropping the
# zero-importance ones.
zero_importance = ['mean perimeter', 'smoothness error', 'mean smoothness',
                   'mean radius']
X_train_selected_features = X_train.drop(columns=zero_importance)
fig, axis = plt.subplots(figsize=(18, 18))
sns.heatmap(X_train_selected_features.corr(method='spearman'), annot=True,
            linewidths=.5, fmt='.1f', ax=axis)

# radius error correlates almost perfectly with area error => drop radius
# error (it has the lower importance score). worst perimeter, worst area and
# mean area correlate almost perfectly with worst radius => drop those three
# for the same reason.
dropped = zero_importance + ['radius error', 'worst perimeter', 'worst area',
                             'mean area']
X_train_selected_features = X_train.drop(columns=dropped)
X_test_selected_features = X_test.drop(columns=dropped)
X_val_selected_features = X_val.drop(columns=dropped)
# Hyperparameter tuning restricted to the selected features.
def ADA_objective(trial):
    """Validation-set F1 of AdaBoost trained on the reduced feature set."""
    n_est = trial.suggest_int("n_estimators", 50, 500)
    lr = trial.suggest_float("learning_rate", 0.01, 1.0, log=True)
    clf = AdaBoostClassifier(n_estimators=n_est, learning_rate=lr, random_state=1)
    clf.fit(X_train_selected_features, y_train)
    return f1_score(y_val, clf.predict(X_val_selected_features))
# Run the Optuna search over the reduced feature set (TPE sampler, fixed
# seed, 100 trials, maximizing validation-set F1).
sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(ADA_objective, n_trials=100)
# Summarize the search outcome.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
# Best trial (observed on the original run):
# Value: 0.98876 — an improvement in the in-sample F1 score
# Params:
# n_estimators: 142
# learning_rate: 0.5704727088203682
# Refit AdaBoost on the reduced feature set with the tuned hyperparameters.
ada_model = AdaBoostClassifier(n_estimators=trial.params["n_estimators"],
                               learning_rate=trial.params["learning_rate"],
                               random_state=1)
ada_model.fit(X_train_selected_features, y_train)

# Feature-importance bar chart. The bars are sorted by importance, so the
# tick labels must be permuted the same way.
# BUG FIX: the original labelled the sorted bars with the UNSORTED
# feature_names_in_, misattributing every bar to the wrong feature.
feature_importances = ada_model.feature_importances_
sorted_idx = feature_importances.argsort()[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(X_train_selected_features.shape[1]),
        feature_importances[sorted_idx], align="center")
plt.xticks(range(X_train_selected_features.shape[1]),
           ada_model.feature_names_in_[sorted_idx], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.title("Feature Importances in ADA Boost Model")
plt.show()

# Out-of-sample F1 on the reduced feature set.
y_pred = ada_model.predict(X_test_selected_features)
print(f1_score(y_test, y_pred))  # 0.95890, worse than 0.97260 before dropping features
############### GRADIENT BOOSTING ###############
# Fit a gradient-boosting model with default hyperparameters.
GB_model = GradientBoostingClassifier(random_state=1)
GB_model.fit(X_train, y_train)

# In-sample F1 score (perfect fit on training data).
y_pred = GB_model.predict(X_train)
print(f1_score(y_train, y_pred))  # f1_score = 1.0

# Out-of-sample F1 score.
y_pred = GB_model.predict(X_test)
print(f1_score(y_test, y_pred))  # f1_score = 0.9517
# Hyperparameter tuning via Optuna.
def GBM_objective(trial):
    """Validation-set F1 for a GradientBoostingClassifier with sampled params."""
    model = GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.001, 1.0, log=True),
        max_depth=trial.suggest_int("max_depth", 2, 10),
        subsample=trial.suggest_float("subsample", 0.1, 1.0, step=0.1),
        random_state=1,
    )
    model.fit(X_train, y_train)
    return f1_score(y_val, model.predict(X_val))
# Run the Optuna search for gradient boosting (TPE sampler, fixed seed,
# 100 trials, maximizing validation-set F1).
sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(GBM_objective, n_trials=100)
# Summarize the search outcome.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
# Best trial (observed on the original run):
# Value: 1.0
# Params:
# n_estimators: 98
# learning_rate: 0.5257150170904044
# max_depth: 4
# subsample: 0.9
# Refit gradient boosting with the tuned hyperparameters.
best = trial.params
GB_model = GradientBoostingClassifier(n_estimators=best["n_estimators"],
                                      learning_rate=best["learning_rate"],
                                      max_depth=best["max_depth"],
                                      subsample=best["subsample"],
                                      random_state=1)
GB_model.fit(X_train, y_train)

# Visualize feature importances, sorted from most to least important; the
# tick labels are permuted with the same index so bars and names line up.
feature_importances = GB_model.feature_importances_
sorted_idx = np.argsort(feature_importances)[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(X_train.shape[1]), feature_importances[sorted_idx], align="center")
plt.xticks(range(X_train.shape[1]), data.feature_names[sorted_idx], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.title("Feature Importances in Gradient Boosting Model")
plt.show()
# Per the importance chart: worst perimeter, worst concave points, worst
# texture, mean concave points, concave points error and worst radius are
# the most important features. Check their pairwise correlations.
top6 = ['worst perimeter', 'worst concave points', 'worst texture',
        'mean concave points', 'concave points error', 'worst radius']
X_train_selected_features = X_train[top6]
fig, axis = plt.subplots(figsize=(18, 18))
sns.heatmap(X_train_selected_features.corr(method='spearman'), annot=True,
            linewidths=.5, fmt='.1f', ax=axis)
# worst radius correlates perfectly with worst perimeter; since worst
# perimeter has the higher importance, drop worst radius.
# Final variables: worst perimeter, worst concave points, worst texture,
# mean concave points, concave points error.

# Before switching to the selected variables, check the out-of-sample result
# of the current (all-features) model specification.
y_pred = GB_model.predict(X_test)
print(f1_score(y_test, y_pred))  # f1_score = 0.9726

# Build the reduced train/val/test frames from the selected features.
final_gb_features = ['worst perimeter', 'worst concave points',
                     'worst texture', 'mean concave points',
                     'concave points error']
X_train_selected_features = X_train[final_gb_features]
X_test_selected_features = X_test[final_gb_features]
X_val_selected_features = X_val[final_gb_features]
# Hyperparameter tuning on the selected features.
def GBM_objective(trial):
    """Validation-set F1 of gradient boosting trained on the reduced feature set."""
    n_est = trial.suggest_int("n_estimators", 50, 500)
    lr = trial.suggest_float("learning_rate", 0.001, 1.0, log=True)
    depth = trial.suggest_int("max_depth", 2, 10)
    frac = trial.suggest_float("subsample", 0.1, 1.0, step=0.1)
    clf = GradientBoostingClassifier(n_estimators=n_est, learning_rate=lr,
                                     max_depth=depth, subsample=frac,
                                     random_state=1)
    clf.fit(X_train_selected_features, y_train)
    return f1_score(y_val, clf.predict(X_val_selected_features))
# Run the Optuna search over the reduced feature set (TPE sampler, fixed
# seed, 100 trials, maximizing validation-set F1).
sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(GBM_objective, n_trials=100)
# Summarize the search outcome.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(" {}: {}".format(key, value))
# Best trial (observed on the original run):
# Value: 1.0
# Params:
# n_estimators: 359
# learning_rate: 0.31906341975134206
# max_depth: 2
# subsample: 0.8
# Refit gradient boosting on the selected features with the tuned params.
GB_model = GradientBoostingClassifier(n_estimators=trial.params["n_estimators"],
                                      learning_rate=trial.params["learning_rate"],
                                      max_depth=trial.params["max_depth"],
                                      subsample=trial.params["subsample"],
                                      random_state=1)
GB_model.fit(X_train_selected_features, y_train)

# Feature-importance chart. The bars are sorted by importance, so the tick
# labels must be permuted with sorted_idx as well.
# BUG FIX: the original used the UNSORTED feature_names_in_ as labels for
# sorted bars, mislabelling every bar.
feature_importances = GB_model.feature_importances_
sorted_idx = feature_importances.argsort()[::-1]
plt.figure(figsize=(10, 6))
plt.bar(range(X_train_selected_features.shape[1]),
        feature_importances[sorted_idx], align="center")
plt.xticks(range(X_train_selected_features.shape[1]),
           GB_model.feature_names_in_[sorted_idx], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.title("Feature Importances in Gradient Boosting Model")
plt.show()

# Out-of-sample F1 on the reduced feature set.
y_pred = GB_model.predict(X_test_selected_features)
print(f1_score(y_test, y_pred))  # 0.9796 => better than 0.9726 (before dropping worst radius)
############# MODEL COMPARISON #############
# Performance is compared via out-of-sample F1 score before and after
# hyperparameter tuning:
# Logistic model, default hyperparameters + feature selection: 0.952
# Logistic model, tuned hyperparameters + feature selection: 0.952
# AdaBoost, default hyperparameters: 0.95238
# AdaBoost, tuned hyperparameters: 0.97260
# AdaBoost, tuned hyperparameters + feature selection: 0.9589
# GB, default hyperparameters: 0.9517
# GB, tuned hyperparameters: 0.9726
# GB, tuned hyperparameters + feature selection: 0.9796
# Both AdaBoost and GB perform better than logistic regression after
# hyperparameter tuning and feature selection, as evidenced by the higher F1
# scores of both boosting models.
# Hyperparameter tuning has a positive impact on both AdaBoost and Gradient
# Boosting (F1 increased slightly after tuning) but almost no impact on the
# logistic model — perhaps because logistic regression has far fewer
# hyperparameters to tune than the boosting models.
# Feature selection based on feature importance plus Spearman correlation
# improves Gradient Boosting, hurts AdaBoost, and leaves the logistic model
# essentially unchanged.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement