Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Wed Nov 8 19:20:49 2023
- @author: Lenovo
- """
- from ucimlrepo import fetch_ucirepo
- import matplotlib.pyplot as plt
- import warnings
- from sklearn.model_selection import train_test_split
- import pandas as pd
- import seaborn as sns
- from sklearn.preprocessing import StandardScaler
- from sklearn.feature_selection import mutual_info_classif
- import numpy as np
- from skfeature.function.similarity_based import fisher_score
- from sklearn.ensemble import RandomForestClassifier as rfc
- from sklearn.feature_selection import RFECV
- import sklearn
- from sklearn.feature_selection import RFE
- from collections import Counter
- from catboost import CatBoostClassifier
- from sklearn.pipeline import make_pipeline
- from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
- from xgboost import XGBClassifier
- from lightgbm import LGBMClassifier
- from sklearn.model_selection import cross_val_score
- from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
- import optuna
- import functools
- from imblearn.over_sampling import SMOTE
# Fetch the UCI Wine Quality dataset (id=186) via ucimlrepo.
wine_quality = fetch_ucirepo(id=186)
# data (as pandas dataframes)
x = wine_quality.data.features
y = wine_quality.data.targets
# Checking for null values (results inspected interactively).
x.isnull().values.any()
y.isnull().values.any()
y.hist(bins=25, figsize=(10, 10))
# Looks like we only have observations starting from quality 3,
# and there are not many observations for quality 3 and 9.
df = pd.concat([x, y], axis=1)
# Group wine quality into 3 classes:
#   quality >= 7      -> 2 (high)
#   5 <= quality <= 6 -> 1 (medium)
#   quality <  5      -> 0 (low)
df['quality_class'] = 0  # initialize all rows to 0 (low quality)
# BUG FIX: the original condition was `> 7`, which silently left
# quality == 7 labelled as low (0) — a gap in the stated three-way split.
df.loc[df['quality'] >= 7, 'quality_class'] = 2  # high quality
df.loc[(df['quality'] >= 5) & (df['quality'] <= 6), 'quality_class'] = 1  # medium quality
df['quality_class'].unique()
df.drop('quality', inplace=True, axis=1)
# 80/20 train/test split on the engineered target.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['quality_class'], axis=1), df['quality_class'],
    test_size=0.2, random_state=1)
# The dataset is imbalanced, so oversample the TRAINING set with SMOTE to
# equalize the representation of the 3 classes (test set is left untouched).
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)
sns.countplot(x=y_train)
plt.show()
warnings.filterwarnings('ignore')
# Distribution plot + box plot for every numeric feature.
fig, ax = plt.subplots(12, 2, figsize=(13, 40))
for index, i in enumerate(df.drop(['quality_class'], axis=1).columns):
    sns.distplot(df[i], ax=ax[index, 0])
    sns.boxplot(df[i], ax=ax[index, 1], linewidth=1.2)
    plt.grid()
fig.tight_layout()
fig.subplots_adjust(top=0.95)
plt.suptitle("Visualization of Numeric Features", fontsize=20)
sns.pairplot(df, hue='quality_class', corner=True)
sns.heatmap(df.sort_values(by='quality_class', ascending=True).corr(method='spearman'),
            cmap='viridis', annot=True)
# alcohol and density have the highest correlation. This suggests that we
# might need to exclude 1 variable after feature selection.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Add a uniformly-distributed noise column to the training set; it acts as a
# baseline for feature selection (any real feature should beat pure noise).
# BUG FIX: the sample count was hard-coded to 8781 (the post-SMOTE training
# size of one particular run); derive it from the data so a change in the
# split or resampling does not break this line.
uniform_samples = np.random.uniform(0, 1, X_train.shape[0])
X_train = pd.DataFrame(X_train, columns=df.drop(columns=['quality_class']).columns)
X_train['uniform_dist_var'] = uniform_samples
X_test = pd.DataFrame(X_test, columns=df.drop(columns=['quality_class']).columns)
# Reset indices so the targets align positionally with the rebuilt frames.
y_train = y_train.reset_index()
y_train = y_train['quality_class']
y_test = y_test.reset_index()
y_test = y_test['quality_class']
df_train = pd.concat([X_train, y_train], axis=1)
# Checking distribution of all features
X_train.hist(bins=25, figsize=(10, 10))
plt.show()
# Checking correlation including the dependent variable
x_vars = df_train.drop(columns=['quality_class'], axis=1).columns
sns.heatmap(df_train.sort_values(by='quality_class', ascending=True).corr(method='spearman'),
            cmap='viridis', annot=True)
# Feature Selection — Information Gain (mutual information with the target):
ig = mutual_info_classif(X_train, y_train)
# Pair each feature name with its score.
# IDIOM FIX: the original maintained a manual counter `j` inside a loop;
# dict(zip(...)) expresses the same pairing directly.
feature_scores = dict(zip(x_vars, ig))
# Sort the features by importance score in descending order
sorted_features = sorted(feature_scores.items(), key=lambda kv: kv[1], reverse=True)
for feature, score in sorted_features:
    print('Feature:', feature, 'Score:', score)
# Plot a horizontal bar chart of the feature importance scores
fig, ax = plt.subplots()
y_pos = np.arange(len(sorted_features))
ax.barh(y_pos, [score for feature, score in sorted_features], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels([feature for feature, score in sorted_features])
ax.invert_yaxis()  # Labels read top-to-bottom
ax.set_xlabel("Importance Score")
ax.set_title("Feature Importance Scores (Information Gain)")
# Add importance scores as labels on the horizontal bar chart
for i, v in enumerate([score for feature, score in sorted_features]):
    ax.text(v + 0.01, i, str(round(v, 3)), color="black", fontweight="bold")
plt.show()
# Fisher score (rank-based importance; higher rank = more important):
rank = fisher_score.fisher_score(X_train.to_numpy(), y_train.to_numpy(), mode='rank')
# IDIOM FIX: replaced the manual counter loop with dict(zip(...)).
features_rank = dict(zip(x_vars, rank))
# Sort the features by importance score in descending order
features_rank_descending = sorted(features_rank.items(), key=lambda kv: kv[1], reverse=True)
features_rank_descending
# Rank (the higher value, the more important the variable):
# [('total_sulfur_dioxide', 11),
#  ('alcohol', 10),
#  ('sulphates', 9),
#  ('free_sulfur_dioxide', 8),
#  ('citric_acid', 7),
#  ('density', 6),
#  ('residual_sugar', 5),
#  ('volatile_acidity', 4),
#  ('chlorides', 3),
#  ('pH', 2),
#  ('fixed_acidity', 1),
#  ('uniform_dist_var', 0)]
# Feature selection using Random Forest + RFECV, averaged over many random
# states to smooth out the stochasticity of a single forest.
number_of_random_states = 30
average_optimal = np.zeros(12)  # one slot per candidate feature count (1..12)
scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average='weighted')
# IDIOM FIX: the original kept a separate manual counter `i` that merely
# duplicated the loop variable; enumerate() provides the progress index.
for i, rs in enumerate(range(number_of_random_states), start=1):
    rf_classifier = rfc(random_state=rs)
    rfecv = RFECV(estimator=rf_classifier, step=1, cv=5, scoring=scorer)
    rfecv = rfecv.fit(X_train, y_train)
    average_optimal += np.asarray(rfecv.cv_results_["mean_test_score"])
    print ('progress ' + str(round(i/number_of_random_states*100)) + '%')
average_optimal /= number_of_random_states
plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1), average_optimal)
print("Number of features selected :", np.argmax(average_optimal)+1)
print("Evaluation of the optimal f1 :", np.max(average_optimal))
plt.show()
# Out of 12 variables, only 11 improve the model performance; the last one
# makes performance deteriorate. I suspect it is the uniformly distributed
# noise variable. Check whether it appears among the top 11 variables.
most_appearing_features = []
for rs in range(10):
    rf_classifier = rfc(random_state=rs)
    rfe = RFE(estimator=rf_classifier, n_features_to_select=11, step=1)
    rfe = rfe.fit(X_train, y_train)
    most_appearing_features.append(X_train.columns[rfe.support_].tolist())
# Flatten the per-run lists into one list of selected feature names.
most_appearing_features = [item for sublist in most_appearing_features for item in sublist]
print('Most appearing features :')
Counter(most_appearing_features).most_common(11)
# [('volatile_acidity', 10),
#  ('residual_sugar', 10),
#  ('chlorides', 10),
#  ('free_sulfur_dioxide', 10),
#  ('total_sulfur_dioxide', 10),
#  ('density', 10),
#  ('pH', 10),
#  ('sulphates', 10),
#  ('alcohol', 10)]
# The uniformly distributed noise variable never shows up in the top 11 most
# appearing variables: every real feature outperforms noise, so drop it.
X_train.drop(['uniform_dist_var'], inplace=True, axis=1)
- ##### Fitting data to different boosting models using hyperparameter tunning #####
- # GBM:
def objective_gbc(trial, X, y):
    """Optuna objective for GradientBoostingClassifier.

    Samples hyperparameters from *trial* and returns the mean 5-fold
    cross-validation accuracy on (X, y) — the value Optuna maximizes.
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        # FIX: suggest_loguniform is deprecated in Optuna; suggest_float with
        # log=True is the equivalent modern API already used elsewhere here.
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e+0, log=True),
        'random_state': trial.suggest_int('random_state', 0, 0),  # pinned to 0
    }
    clf = make_pipeline(GradientBoostingClassifier(**params))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)
- # XGBoost:
def objective_xgbc(trial, X, y):
    """Optuna objective for XGBClassifier.

    Samples hyperparameters from *trial* and returns the mean 5-fold
    cross-validation accuracy on (X, y) — the value Optuna maximizes.
    """
    params = {
        'objective': trial.suggest_categorical('objective', ['multi:softmax']),
        'random_state': trial.suggest_int('random_state', 0, 0),  # pinned to 0
        'n_estimators': trial.suggest_int('n_estimators', 10, 500, step=10),  # num_boosting_round
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        # FIX: suggest_loguniform is deprecated in Optuna; suggest_float with
        # log=True is the equivalent API already used for the params below.
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.1, 0.05, 0.01]),  # eta
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'min_split_loss': trial.suggest_float('min_split_loss', 1e-8, 1.0, log=True),  # gamma
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),  # 'gblinear' excluded
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1.0, log=True),  # lambda
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),  # alpha
    }
    clf = make_pipeline(XGBClassifier(**params, use_label_encoder=False, num_class=3))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)
- # CatBoost:
def objective_catboost(trial, X, y):
    """Optuna objective for CatBoostClassifier.

    Samples hyperparameters from *trial* and returns the mean 5-fold
    cross-validation accuracy on (X, y) — the value Optuna maximizes.
    """
    # NOTE: keep the suggest-call order stable; a seeded sampler draws values
    # in call order, so reordering would change the search trajectory.
    lr = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True)
    tree_depth = trial.suggest_int("depth", 1, 10)
    # "subsample" was deliberately left out of the search space.
    col_frac = trial.suggest_float("colsample_bylevel", 0.05, 1.0)
    leaf_min = trial.suggest_int("min_data_in_leaf", 1, 100)
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=lr,
        depth=tree_depth,
        colsample_bylevel=col_frac,
        min_data_in_leaf=leaf_min,
        silent=True,
    )
    pipeline = make_pipeline(model)
    return np.mean(cross_val_score(pipeline, X, y, cv=5))
- # Light Gradient Boosting:
def objective_lgbc(trial, X, y):
    """Optuna objective for LGBMClassifier.

    Samples hyperparameters from *trial* and returns the mean 5-fold
    cross-validation accuracy on (X, y) — the value Optuna maximizes.
    """
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        "verbosity": -1,
        # BUG FIX: the Optuna parameter name had a trailing space
        # ('boosting_type '), so study.best_params carried the malformed key
        # and LGBMClassifier(**best_params) downstream received an unknown
        # keyword instead of the intended boosting_type setting.
        "boosting_type": trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        "num_class": 3,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    clf = make_pipeline(LGBMClassifier(**params))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)
- # Histogram Gradient Boosting:
def objective_hgbc(trial, X, y):
    """Optuna objective for HistGradientBoostingClassifier.

    Samples hyperparameters from *trial* and returns the mean 5-fold
    cross-validation accuracy on (X, y) — the value Optuna maximizes.
    """
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_iter": 1000,  # fixed boosting-iteration budget
        # BUG FIX: scikit-learn requires max_leaf_nodes to be strictly greater
        # than 1; the original lower bound of 1 made those trials error out.
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        "l2_regularization": trial.suggest_float("l2_regularization", 1e-6, 1e3, log=True),
        "max_bins": trial.suggest_int('max_bins', 2, 255),
    }
    clf = make_pipeline(HistGradientBoostingClassifier(**params))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)
def calculate_score(model, X_train, y_train, X_test, y_test):
    """Fit *model* on the training data and score it on the test data.

    Returns a 4-tuple (accuracy, f1, recall, precision); the latter three
    are support-weighted averages across the classes.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions, average='weighted'),
        recall_score(y_test, predictions, average='weighted'),
        precision_score(y_test, predictions, average='weighted'),
    )
# Hyperparameter tuning: one Optuna study per boosting algorithm, 30
# random-search trials each (RandomSampler seeded for reproducibility).
study_gbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_gbc.optimize(functools.partial(objective_gbc, X=X_train, y=y_train), n_trials=30)
study_xgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_xgbc.optimize(functools.partial(objective_xgbc, X=X_train, y=y_train), n_trials=30)
study_catboost = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_catboost.optimize(functools.partial(objective_catboost, X=X_train, y=y_train), n_trials=30)
study_lgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_lgbc.optimize(functools.partial(objective_lgbc, X=X_train, y=y_train), n_trials=30)
study_hgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_hgbc.optimize(functools.partial(objective_hgbc, X=X_train, y=y_train), n_trials=30)
# Creating classifiers using the tuned parameters obtained from the studies:
gbc = GradientBoostingClassifier(**study_gbc.best_params)
xgbc = XGBClassifier(**study_xgbc.best_params, use_label_encoder=False)
catboost = CatBoostClassifier(**study_catboost.best_params)
lgbc = LGBMClassifier(**study_lgbc.best_params)
hgbc = HistGradientBoostingClassifier(**study_hgbc.best_params)
# BUG FIX: the original list was [gbc, xgbc, lgbc, gbc, hgbc] — `gbc` appeared
# twice and `catboost` was missing, so the scores reported under "CatBoost"
# were actually LightGBM's and "Light GBM" duplicated Gradient Boosting
# (which is why the historical results below contain two identical rows).
models = [gbc, xgbc, catboost, lgbc, hgbc]
names = ['Gradient Boosting Classifier', 'XGBoost Classifier', 'CatBoost Classifier',
         'Light GBM Classifier', 'Hist Gradient Boosting Classifier']
# Calculating various model performance metrics on the held-out test set.
scorelist = {}
for name, classifier in zip(names, models):
    acc, f1, recall, precision = calculate_score(classifier, X_train, y_train, X_test, y_test)
    scorelist[name] = (acc, f1, recall, precision)
scorelist
# RESULT COMPARISON (NOTE: numbers below were produced by the original,
# mislabelled model list and should be regenerated after the fix) #
# 'Gradient Boosting Classifier': (acc = 0.8224489795918367, f1 = 0.8157758677913398, recall = 0.8224489795918367, precision = 0.812542143991651)
# 'XGBoost Classifier': (acc = 0.8040816326530612, f1 = 0.8014447959294754, recall = 0.8040816326530612, precision = 0.8000441051063351))
# 'CatBoost Classifier': (acc = 0.8142857142857143, f1 = 0.8073504595990978, recall = 0.8142857142857143, precision = 0.8035260754952066)
# 'Light GBM Classifier': (acc = 0.8224489795918367, f1 = 0.8157758677913398, recall = 0.8224489795918367, precision = 0.812542143991651)
# 'Hist Gradient Boosting Classifier': (acc = 0.8112244897959183, f1 = 0.806735193615864, recall = 0.8112244897959183, precision = 0.8035021325007075)
# The identical "Gradient Boosting" and "Light GBM" rows were caused by the
# duplicated `gbc` entry fixed above, not by the algorithms converging.
# Best overall performer(s): Gradient Boosting and Light GBM, followed by CatBoost, Hist Gradient Boosting and XGBoost
# Reason why XGBoost underperforms in this dataset perhaps because of regularization which reduces overfitting on training sample.
# More specifically, perhaps because the testing data closely resemble training data, any algorithm which penalizes overfitting
# might underperform compared to learning algorithm which has no or little regularization.
# Catboost underperforms compared to other learning algorithms (except XGBoost) perhaps due to explanatory variables are continuous rather than categorical.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement