Nam_Hoang_Waw

Machine Learning II - exercise 3 - GBM_XGBM_LightGBM_CatBoost_Final

Nov 9th, 2023
# -*- coding: utf-8 -*-
"""
Created on Wed Nov  8 19:20:49 2023

@author: Lenovo
"""

from ucimlrepo import fetch_ucirepo
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
import numpy as np
from skfeature.function.similarity_based import fisher_score
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.feature_selection import RFECV
import sklearn
from sklearn.feature_selection import RFE
from collections import Counter
from catboost import CatBoostClassifier
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import optuna
import functools
from imblearn.over_sampling import SMOTE


wine_quality = fetch_ucirepo(id=186)

# data (as pandas dataframes)
x = wine_quality.data.features
y = wine_quality.data.targets

# Checking for null values
x.isnull().values.any()
y.isnull().values.any()

y.hist(bins=25, figsize=(10, 10))
# Quality values start at 3, and there are very few observations for quality 3 and 9.

df = pd.concat([x, y], axis=1)


# Assume that wine with quality above 7 is high quality, quality between 5 and 6 is medium quality,
# and anything else is low quality. We then group the target data into 3 classes:

df['quality_class'] = 0  # initialize all values to 0, corresponding to low quality

# Group wine quality into 3 classes: high (2), medium (1) and low (0):
df.loc[df['quality'] > 7, 'quality_class'] = 2  # high quality
df.loc[(df['quality'] >= 5) & (df['quality'] <= 6), 'quality_class'] = 1  # medium quality
df['quality_class'].unique()

df.drop('quality', inplace=True, axis=1)
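
# (Optional check, added for illustration) A quick look at the class balance after the regrouping;
# the counts here are what motivate the SMOTE oversampling applied to the training set below.
print(df['quality_class'].value_counts())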

# We split the data
X_train, X_test, y_train, y_test = train_test_split(df.drop(['quality_class'], axis=1), df['quality_class'], test_size=0.2, random_state=1)

# The dataset is imbalanced, so we oversample the training set with SMOTE to give the 3 classes equal representation
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

sns.countplot(x=y_train)
plt.show()
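
# (Optional check, added for illustration) Print the class counts after SMOTE; all three classes
# should now appear the same number of times in the training set.
print(Counter(y_train))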

warnings.filterwarnings('ignore')
fig, ax = plt.subplots(12, 2, figsize=(13, 40))
for index, i in enumerate(df.drop(['quality_class'], axis=1).columns):
    sns.distplot(df[i], ax=ax[index, 0])
    sns.boxplot(df[i], ax=ax[index, 1], linewidth=1.2)
    plt.grid()
fig.tight_layout()
fig.subplots_adjust(top=0.95)

plt.suptitle("Visualization of Numeric Features", fontsize=20)

sns.pairplot(df, hue='quality_class', corner=True)

sns.heatmap(df.sort_values(by='quality_class', ascending=True).corr(method='spearman'), cmap='viridis', annot=True)
# alcohol and density show the strongest correlation, which suggests we may need to exclude one of them after feature selection
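
# (Optional, added for illustration) The exact Spearman correlation between the two most
# correlated features can also be printed directly:
print(df[['alcohol', 'density']].corr(method='spearman'))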

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Add a uniformly distributed noise variable (one value per training row) to serve as a baseline in the feature-selection step
uniform_samples = np.random.uniform(0, 1, X_train.shape[0])
X_train = pd.DataFrame(X_train, columns=df.drop(columns=['quality_class']).columns)
X_train['uniform_dist_var'] = uniform_samples
X_test = pd.DataFrame(X_test, columns=df.drop(columns=['quality_class']).columns)

y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
df_train = pd.concat([X_train, y_train], axis=1)

# Checking the distribution of all features
X_train.hist(bins=25, figsize=(10, 10))
plt.show()

# Correlation heatmap of the resampled, scaled training data (including the noise variable)

x_vars = df_train.drop(columns=['quality_class'], axis=1).columns

sns.heatmap(df_train.sort_values(by='quality_class', ascending=True).corr(method='spearman'), cmap='viridis', annot=True)

# Feature Selection:

    # Information Gain:

ig = mutual_info_classif(X_train, y_train)
feature_scores = {}
j = 0
for i in x_vars:
    feature_scores[i] = ig[j]
    j = j + 1
# Sort the features by importance score in descending order
sorted_features = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)

for feature, score in sorted_features:
    print('Feature:', feature, 'Score:', score)

# Plot a horizontal bar chart of the feature importance scores
fig, ax = plt.subplots()
y_pos = np.arange(len(sorted_features))
ax.barh(y_pos, [score for feature, score in sorted_features], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels([feature for feature, score in sorted_features])
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel("Importance Score")
ax.set_title("Feature Importance Scores (Information Gain)")

# Add importance scores as labels on the horizontal bar chart
for i, v in enumerate([score for feature, score in sorted_features]):
    ax.text(v + 0.01, i, str(round(v, 3)), color="black", fontweight="bold")
plt.show()

    # Fisher score:
rank = fisher_score.fisher_score(X_train.to_numpy(), y_train.to_numpy(), mode='rank')

# Listing the ranks
features_rank = {}
j = 0
for i in x_vars:
    features_rank[i] = rank[j]
    j = j + 1
# Sort the features by rank in descending order
features_rank_descending = sorted(features_rank.items(), key=lambda x: x[1], reverse=True)

features_rank_descending

# Rank (the higher the value, the more important the variable):

# [('total_sulfur_dioxide', 11),
#  ('alcohol', 10),
#  ('sulphates', 9),
#  ('free_sulfur_dioxide', 8),
#  ('citric_acid', 7),
#  ('density', 6),
#  ('residual_sugar', 5),
#  ('volatile_acidity', 4),
#  ('chlorides', 3),
#  ('pH', 2),
#  ('fixed_acidity', 1),
#  ('uniform_dist_var', 0)]


    # Feature selection using Random Forest:

number_of_random_states = 30
average_optimal = np.zeros(12)  # 11 original features + 1 noise variable
scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average='weighted')

i = 0
for rs in range(number_of_random_states):
    rf_classifier = rfc(random_state=rs)
    rfecv = RFECV(estimator=rf_classifier, step=1, cv=5, scoring=scorer)
    rfecv = rfecv.fit(X_train, y_train)
    average_optimal += np.asarray(rfecv.cv_results_["mean_test_score"])
    i = i + 1
    print('progress ' + str(round(i / number_of_random_states * 100)) + '%')

average_optimal /= number_of_random_states
plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1), average_optimal)
print("Number of features selected :", np.argmax(average_optimal) + 1)
print("Evaluation of the optimal f1 :", np.max(average_optimal))
plt.show()

# Out of 12 variables, only 11 improve model performance; the last variable makes performance deteriorate.
# I suspect it is the uniformly distributed variable (noise). Let's check whether that variable appears among the top 11 variables.

most_appearing_features = []

for rs in range(10):
    rf_classifier = rfc(random_state=rs)
    rfe = RFE(estimator=rf_classifier, n_features_to_select=11, step=1)
    rfe = rfe.fit(X_train, y_train)
    most_appearing_features.append(X_train.columns[rfe.support_].tolist())
most_appearing_features = [item for sublist in most_appearing_features for item in sublist]

print('Most appearing features :')
Counter(most_appearing_features).most_common(11)

# [('volatile_acidity', 10),
#  ('residual_sugar', 10),
#  ('chlorides', 10),
#  ('free_sulfur_dioxide', 10),
#  ('total_sulfur_dioxide', 10),
#  ('density', 10),
#  ('pH', 10),
#  ('sulphates', 10),
#  ('alcohol', 10)]

# It looks like we should exclude the uniformly distributed variable, as it does not show up in the list of the 11 most frequently selected variables.
# All of the original variables outperform the noise variable in terms of feature importance, so we remove the dummy noise variable.

X_train.drop(['uniform_dist_var'], inplace=True, axis=1)

##### Fitting the data to different boosting models using hyperparameter tuning #####

# GBM:

def objective_gbc(trial, X, y):
    params = {'max_depth': trial.suggest_int('max_depth', 1, 10),
              'n_estimators': trial.suggest_int('n_estimators', 10, 500),
              'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e+0, log=True),
              'random_state': trial.suggest_int('random_state', 0, 0)
             }
    clf = make_pipeline(GradientBoostingClassifier(**params))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)

# XGBoost:

def objective_xgbc(trial, X, y):
    params = {'objective': trial.suggest_categorical('objective', ['multi:softmax']),
              'random_state': trial.suggest_int('random_state', 0, 0),
              'n_estimators': trial.suggest_int('n_estimators', 10, 500, step=10),  # num_boosting_round
              'max_depth': trial.suggest_int('max_depth', 1, 9),
              'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
              'learning_rate': trial.suggest_categorical('learning_rate', [0.1, 0.05, 0.01]),  # eta
              'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
              'min_split_loss': trial.suggest_float('min_split_loss', 1e-8, 1.0, log=True),  # gamma
              'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),  # , 'gblinear'
              'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1.0, log=True),  # lambda
              'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0)  # alpha
             }
    clf = make_pipeline(XGBClassifier(**params, use_label_encoder=False, num_class=3))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)

# CatBoost:

def objective_catboost(trial, X, y):
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        # "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }
    clf = make_pipeline(CatBoostClassifier(**params, silent=True))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)

# Light Gradient Boosting:

def objective_lgbc(trial, X, y):
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "verbosity": -1,
        "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "dart"]),
        "num_class": 3,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    clf = make_pipeline(LGBMClassifier(**params))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)

# Histogram Gradient Boosting:

def objective_hgbc(trial, X, y):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_iter": 1000,
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 100),  # must be strictly greater than 1
        "max_depth": trial.suggest_int("max_depth", 1, 9),
        "l2_regularization": trial.suggest_float("l2_regularization", 1e-6, 1e3, log=True),
        "max_bins": trial.suggest_int("max_bins", 2, 255)
    }
    clf = make_pipeline(HistGradientBoostingClassifier(**params))
    scores = cross_val_score(clf, X, y, cv=5)
    return np.mean(scores)

def calculate_score(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # auc = roc_auc_score(y_test, y_pred, multi_class='ovo')
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')

    return acc, f1, recall, precision

# Hyperparameter tuning:
study_gbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_gbc.optimize(functools.partial(objective_gbc, X=X_train, y=y_train), n_trials=30)

study_xgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_xgbc.optimize(functools.partial(objective_xgbc, X=X_train, y=y_train), n_trials=30)

study_catboost = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_catboost.optimize(functools.partial(objective_catboost, X=X_train, y=y_train), n_trials=30)

study_lgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_lgbc.optimize(functools.partial(objective_lgbc, X=X_train, y=y_train), n_trials=30)

study_hgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
study_hgbc.optimize(functools.partial(objective_hgbc, X=X_train, y=y_train), n_trials=30)
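
# (Optional, added for illustration) Summarize the best cross-validated accuracy and the
# corresponding parameters found by each study before refitting the final models.
for name, study in [('GBM', study_gbc), ('XGBoost', study_xgbc), ('CatBoost', study_catboost),
                    ('LightGBM', study_lgbc), ('HistGBM', study_hgbc)]:
    print(name, 'best CV score:', study.best_value)
    print(name, 'best params:', study.best_params)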

# Creating classifiers using the tuned parameters obtained in the previous step:
gbc = GradientBoostingClassifier(**study_gbc.best_params)
xgbc = XGBClassifier(**study_xgbc.best_params, use_label_encoder=False)
catboost = CatBoostClassifier(**study_catboost.best_params)
lgbc = LGBMClassifier(**study_lgbc.best_params)
hgbc = HistGradientBoostingClassifier(**study_hgbc.best_params)

# Note: the models must be listed in the same order as the names below
models = [gbc, xgbc, catboost, lgbc, hgbc]
names = ['Gradient Boosting Classifier', 'XGBoost Classifier', 'CatBoost Classifier',
         'Light GBM Classifier', 'Hist Gradient Boosting Classifier']

# Calculating various model performance metrics
scorelist = {}

for name, classifier in zip(names, models):
    acc, f1, recall, precision = calculate_score(classifier, X_train, y_train, X_test, y_test)
    scorelist[name] = (acc, f1, recall, precision)

scorelist
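
# (Optional, added for illustration) The same results are easier to compare as a DataFrame,
# with one row per model and one column per metric; 'results' is just an illustrative name.
results = pd.DataFrame(scorelist, index=['accuracy', 'f1', 'recall', 'precision']).T
print(results.sort_values('f1', ascending=False))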

# RESULT COMPARISON #

# 'Gradient Boosting Classifier':      (acc = 0.8224489795918367, f1 = 0.8157758677913398, recall = 0.8224489795918367, precision = 0.812542143991651)
# 'XGBoost Classifier':                (acc = 0.8040816326530612, f1 = 0.8014447959294754, recall = 0.8040816326530612, precision = 0.8000441051063351)
# 'CatBoost Classifier':               (acc = 0.8142857142857143, f1 = 0.8073504595990978, recall = 0.8142857142857143, precision = 0.8035260754952066)
# 'Light GBM Classifier':              (acc = 0.8224489795918367, f1 = 0.8157758677913398, recall = 0.8224489795918367, precision = 0.812542143991651)
# 'Hist Gradient Boosting Classifier': (acc = 0.8112244897959183, f1 = 0.806735193615864, recall = 0.8112244897959183, precision = 0.8035021325007075)

# The Gradient Boosting and Light GBM rows are identical because the run that produced these numbers used a models list
# that passed gbc in the Light GBM slot (and lgbc in the CatBoost slot); with the corrected models list above, those two rows should be re-run.
# Best overall performer(s): Gradient Boosting and Light GBM, followed by CatBoost, Hist Gradient Boosting and XGBoost.
# XGBoost may underperform on this dataset because of its regularization, which reduces overfitting to the training sample.
# More specifically, since the test data closely resemble the training data, any algorithm that penalizes overfitting
# may underperform compared with a learning algorithm that has little or no regularization.
# CatBoost underperforms relative to the other learning algorithms (except XGBoost), perhaps because the explanatory variables are continuous rather than categorical.