Advertisement
Nam_Hoang_Waw

Machine Learning II - exercise 3 - GBM_XGBM_LightGBM_CatBoost

Nov 7th, 2023
626
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 14.26 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Wed Nov  6 19:20:49 2023
  4.  
  5. @author: Lenovo
  6. """
  7.  
  8. from ucimlrepo import fetch_ucirepo
  9. import matplotlib.pyplot as plt
  10. import warnings
  11. from sklearn.model_selection import train_test_split
  12. import pandas as pd
  13. import seaborn as sns
  14. from sklearn.preprocessing import StandardScaler
  15. from sklearn.feature_selection import mutual_info_classif
  16. import numpy as np
  17. from skfeature.function.similarity_based import fisher_score
  18. from  sklearn.ensemble import RandomForestClassifier as rfc
  19. from sklearn.feature_selection import RFECV
  20. import sklearn
  21. from sklearn.feature_selection import RFE
  22. from collections import Counter
  23. import statsmodels.api as sm
  24. import scipy.stats as stats
  25. from sklearn.feature_selection import SelectFromModel
  26. from catboost import CatBoostClassifier
  27. from sklearn.pipeline import make_pipeline
  28. from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
  29. from xgboost import XGBClassifier
  30. from lightgbm import LGBMClassifier
  31. from sklearn.model_selection import cross_val_score
  32. from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, f1_score, recall_score, log_loss, average_precision_score
  33. import optuna
  34. import functools
  35. from imblearn.over_sampling import SMOTE
  36.  
  37.  
  38.  
  39. wine_quality = fetch_ucirepo(id=186)
  40.  
  41. # data (as pandas dataframes)
  42. x = wine_quality.data.features
  43. y = wine_quality.data.targets
  44.  
  45. # Checking for null values
  46. x.isnull().values.any()
  47. y.isnull().values.any()
  48.  
  49. y.hist(bins=25,figsize=(10,10))
  50. # Look like we have only observation starting from 3
  51. # There are not many observations for quality 3 and 9.
  52.  
  53. df = pd.concat([x, y], axis=1)
  54.  
  55.  
  56. # Let assume that wine with quality > 7 is regarded as high quality and anything between 5 and 6 is regarded as medium quality and below 5 is regarded as low quality
  57. # We then group the target data into 3 groups:
  58.    
  59. df['quality_class'] = 0  # Initialize all values to 1 corresponding to low quality
  60.  
  61. # Let group wine quality into 3 groups: high, medium and low as follows:
  62. df.loc[df['quality'] > 7, 'quality_class'] = 2 # high quality
  63. df.loc[(df['quality'] >= 5) & (df['quality'] <= 6), 'quality_class'] = 1 # medium quality
  64. df['quality_class'].unique()
  65.  
  66. df.drop('quality', inplace = True, axis = 1)
  67.  
  68. # we split data
  69. X_train, X_test, y_train, y_test = train_test_split(df.drop(['quality_class'], axis = 1), df['quality_class'], test_size=0.2, random_state=1)
  70.  
  71. # Due to imbalanced dataset, we need to perform resampling in order to make sure that the representativeness of 3 class is equal
  72.  
  73. oversample = SMOTE()
  74. X_train, y_train = oversample.fit_resample(X_train, y_train)
  75.  
  76. sns.countplot(x=y_train)
  77. plt.show()
  78.  
  79. warnings.filterwarnings('ignore')
  80. fig,ax = plt.subplots(12,2,figsize=(13,40))
  81. for index,i in enumerate(df.drop(['quality_class'],axis = 1).columns):
  82.     sns.distplot(df[i],ax=ax[index,0])
  83.     sns.boxplot(df[i],ax=ax[index,1], linewidth = 1.2)
  84.     plt.grid()
  85. fig.tight_layout()
  86. fig.subplots_adjust(top=0.95)
  87.  
  88. plt.suptitle("Visualization of Numeric Features",fontsize=20)
  89.  
  90. sns.pairplot(df,hue = 'quality_class', corner=True)
  91.  
  92. sns.heatmap(df.sort_values(by='quality_class',ascending=True).corr(method='spearman'),cmap='viridis',annot=True)
  93. # alcohol and density have the highest correlation. This suggests that we might need to exclude 1 variable after feature selection
  94.  
  95. sc = StandardScaler()
  96. X_train = sc.fit_transform(X_train)
  97. X_test = sc.transform(X_test)
  98.  
  99. uniform_samples = np.random.uniform(0, 1, 8781)
  100. X_train = pd.DataFrame(X_train, columns=df.drop(columns=['quality_class']).columns)
  101. X_train['uniform_dist_var'] = uniform_samples
  102. X_test = pd.DataFrame(X_test, columns=df.drop(columns=['quality_class']).columns)
  103.  
  104. y_train = y_train.reset_index()
  105. y_train = y_train['quality_class']
  106. y_test = y_test.reset_index()
  107. y_test = y_test['quality_class']
  108. df_train = pd.concat([X_train, y_train], axis=1)
  109.  
  110. # Checking distribution of all features
  111. X_train.hist(bins=25,figsize=(10,10))
  112. plt.show()
  113.  
  114. # Checking distribution of dependent variable
  115.  
  116. x_vars = df_train.drop(columns=['quality_class'], axis=1).columns
  117.  
  118. sns.heatmap(df_train.sort_values(by='quality_class',ascending=True).corr(method='spearman'),cmap='viridis',annot=True)
  119.  
  120. # Feature Selection:
  121.    
  122.     # Information Gain:
  123.        
  124. ig = mutual_info_classif(X_train, y_train)
  125. feature_scores = {}
  126. j = 0
  127. for i in x_vars:
  128.     feature_scores[i] = ig[j]
  129.     j=j+1
  130. # Sort the features by importance score in descending order
  131. sorted_features = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)
  132.  
  133. for feature, score in sorted_features:
  134.     print('Feature:', feature, 'Score:', score)
  135.    
  136. # Plot a horizontal bar chart of the feature importance scores
  137. fig, ax = plt.subplots()
  138. y_pos = np.arange(len(sorted_features))
  139. ax.barh(y_pos, [score for feature, score in sorted_features], align="center")
  140. ax.set_yticks(y_pos)
  141. ax.set_yticklabels([feature for feature, score in sorted_features])
  142. ax.invert_yaxis()  # Labels read top-to-bottom
  143. ax.set_xlabel("Importance Score")
  144. ax.set_title("Feature Importance Scores (Information Gain)")
  145.  
  146. # Add importance scores as labels on the horizontal bar chart
  147. for i, v in enumerate([score for feature, score in sorted_features]):
  148.     ax.text(v + 0.01, i, str(round(v, 3)), color="black", fontweight="bold")
  149. plt.show()
  150.  
  151.     # Fisher score:    
  152. rank = fisher_score.fisher_score(X_train.to_numpy(), y_train.to_numpy(), mode='rank')
  153.  
  154. # Plotting the ranks
  155. features_rank = {}
  156. j = 0
  157. for i in x_vars:
  158.     features_rank[i] = rank[j]
  159.     j=j+1
  160. # Sort the features by importance score in descending order
  161. features_rank_descending = sorted(features_rank.items(), key=lambda x: x[1], reverse=True)
  162.  
  163. features_rank_descending
  164.  
  165. # Rank (the higher value, the more important the variable):
  166.        
  167. # [('total_sulfur_dioxide', 11),
  168. # ('alcohol', 10),
  169. # ('sulphates', 9),
  170. # ('free_sulfur_dioxide', 8),
  171. # ('citric_acid', 7),
  172. # ('density', 6),
  173. # ('residual_sugar', 5),
  174. # ('volatile_acidity', 4),
  175. # ('chlorides', 3),
  176. # ('pH', 2),
  177. # ('fixed_acidity', 1),
  178. # ('uniform_dist_var', 0)]
  179.  
  180.  
  181.     # Feature selection using Random Forest:
  182.        
  183. number_of_random_states = 30
  184. average_optimal = np.zeros(12)
  185. scorer = sklearn.metrics.make_scorer(sklearn.metrics.f1_score, average = 'weighted')
  186.  
  187. i = 0
  188. for rs in range(number_of_random_states):
  189.     rf_classifier = rfc(random_state = rs)
  190.     rfecv = RFECV(estimator=rf_classifier, step=1, cv=5, scoring=scorer)
  191.     rfecv = rfecv.fit(X_train, y_train)
  192.     average_optimal += np.asarray(rfecv.cv_results_["mean_test_score"])
  193.     i = i + 1
  194.     print ('progress ' + str(round(i/number_of_random_states*100)) + '%')
  195.    
  196. average_optimal /= number_of_random_states    
  197. plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1), average_optimal)
  198. print("Number of features selected :", np.argmax(average_optimal)+1)
  199. print("Evaluation of the optimal f1 :", np.max(average_optimal))
  200. plt.show()
  201.  
  202. # out of 12 variables, only 11 improve the model performance. The last variable makes the performance deteriorates.
  203. # I suspect it is the uniformly distributed variable (noise). Let's check if that variable appears in the top 11 variables.
  204.  
  205. most_appearing_features = []
  206.  
  207. for rs in range(10):
  208.     rf_classifier = rfc(random_state=rs)      
  209.     rfe = RFE(estimator=rf_classifier, n_features_to_select=11, step=1)
  210.     rfe = rfe.fit(X_train, y_train)
  211.     most_appearing_features.append(X_train.columns[rfe.support_].tolist())
  212. most_appearing_features = [item for sublist in most_appearing_features for item in sublist]
  213.  
  214. print('Most appearing features :')
  215. Counter(most_appearing_features).most_common(11)
  216.  
  217. # [('volatile_acidity', 10),
  218. # ('residual_sugar', 10),
  219. # ('chlorides', 10),
  220. # ('free_sulfur_dioxide', 10),
  221. # ('total_sulfur_dioxide', 10),
  222. # ('density', 10),
  223. # ('pH', 10),
  224. # ('sulphates', 10),
  225. # ('alcohol', 10)]
  226.  
  227. # Looks like we need to exclude uniformly distributed variable as it does not show up in the list of top 11 most appearing variables
  228. # All of the initial variables outperform noise variable in term of feature importance. As a result, we remove dummy noise variable
  229.  
  230. X_train.drop(['uniform_dist_var'], inplace = True, axis = 1)
  231.  
  232. ##### Fitting data to different boosting models #####
  233.  
  234. # GBM:
  235.  
  236. def objective_gbc(trial, X, y):
  237.     params = {'max_depth': trial.suggest_int('max_depth', 1, 10),
  238.               'n_estimators': trial.suggest_int('n_estimators', 10, 500),
  239.               'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e+0),
  240.               'random_state': trial.suggest_int('random_state', 0, 0)
  241.              }
  242.     clf = make_pipeline(GradientBoostingClassifier(**params))
  243.     scores = cross_val_score(clf, X, y, cv=5)
  244.     return np.mean(scores)
  245.  
  246. # XGBoost:
  247.  
  248. def objective_xgbc(trial, X, y):
  249.     params = {'objective': trial.suggest_categorical('objective', ['multi:softmax']),
  250.                   'random_state': trial.suggest_int('random_state', 0, 0),
  251.                   'n_estimators': trial.suggest_int('n_estimators', 10, 500, step=10), # num_boosting_round
  252.                   'max_depth': trial.suggest_int('max_depth', 1, 9),
  253.                   'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
  254.                   'learning_rate': trial.suggest_categorical('learning_rate', [0.1, 0.05, 0.01]), # eta
  255.                   'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
  256.                   'min_split_loss': trial.suggest_float('min_split_loss', 1e-8, 1.0, log=True), # gamma
  257.                   'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']), # , 'gblinear'
  258.                   'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1.0, log=True), # lambda
  259.                   'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0)# alpha
  260.                  }
  261.     clf = make_pipeline(XGBClassifier(**params, use_label_encoder=False, num_class = 3))
  262.     scores = cross_val_score(clf, X, y, cv=5)
  263.     return np.mean(scores)
  264.  
  265. def objective_catboost(trial, X, y):
  266.     params = {
  267.         "iterations": 1000,
  268.         "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
  269.         "depth": trial.suggest_int("depth", 1, 10),
  270.         #"subsample": trial.suggest_float("subsample", 0.05, 1.0),
  271.         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
  272.         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
  273.     }
  274.     clf = make_pipeline(CatBoostClassifier(**params, silent=True))
  275.     scores = cross_val_score(clf, X, y, cv=5)
  276.     return np.mean(scores)
  277.    
  278. def objective_lgbc(trial, X, y):
  279.     params = {
  280.         "objective": "multiclass",
  281.         "metric": "multi_logloss",
  282.         "verbosity": -1,
  283.         "boosting_type": trial.suggest_categorical('boosting_type ', ['gbdt', 'dart']),
  284.         "num_class": 3,
  285.         "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
  286.         "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
  287.         "num_leaves": trial.suggest_int("num_leaves", 2, 256),
  288.         "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
  289.         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
  290.         "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
  291.         "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
  292.     }
  293.     clf = make_pipeline(LGBMClassifier(**params))
  294.     scores = cross_val_score(clf, X, y, cv=5)
  295.     return np.mean(scores)
  296.  
  297. def objective_hgbc(trial, X, y):
  298.     params = {
  299.         "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
  300.         "max_iter": 1000,
  301.         "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 1, 100),
  302.         'max_depth': trial.suggest_int('max_depth', 1, 9),
  303.         "l2_regularization": trial.suggest_float("l2_regularization", 1e-6, 1e3, log=True),
  304.         "max_bins": trial.suggest_int('max_bins', 2, 255)        
  305.     }
  306.     clf = make_pipeline(HistGradientBoostingClassifier(**params))
  307.     scores = cross_val_score(clf, X, y, cv=5)
  308.     return np.mean(scores)
  309.  
  310. def calculate_score(model, X_train, y_train, X_test, y_test):
  311.     model.fit(X_train, y_train)
  312.     y_pred = model.predict(X_test)
  313.     acc = accuracy_score(y_test, y_pred)
  314.     #auc = roc_auc_score(y_test, y_pred, multi_class= 'ovo')
  315.     f1 = f1_score(y_test, y_pred, average= 'weighted')
  316.     recall = recall_score(y_test, y_pred, average= 'weighted')
  317.     precision = precision_score(y_test, y_pred, average= 'weighted')
  318.    
  319.     return acc, f1, recall, precision
  320.  
  321. # Hyperparameter tunning:
  322. study_gbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
  323. study_gbc.optimize(functools.partial(objective_gbc, X=X_train, y=y_train), n_trials=30)
  324.  
  325. study_xgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
  326. study_xgbc.optimize(functools.partial(objective_xgbc, X=X_train, y=y_train), n_trials=30)
  327.  
  328. study_catboost = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
  329. study_catboost.optimize(functools.partial(objective_catboost, X=X_train, y=y_train), n_trials=30)
  330.  
  331. study_lgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
  332. study_lgbc.optimize(functools.partial(objective_lgbc, X=X_train, y=y_train), n_trials=30)
  333.  
  334. study_hgbc = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=0))
  335. study_hgbc.optimize(functools.partial(objective_hgbc, X=X_train, y=y_train), n_trials=30)
  336.  
  337. # Creating Classifiers using tunned parameters obtained from previous step:
  338. gbc = GradientBoostingClassifier(**study_gbc.best_params)
  339. xgbc = XGBClassifier(**study_xgbc.best_params, use_label_encoder=False)
  340. catboost = CatBoostClassifier(**study_catboost.best_params)
  341. lgbc = LGBMClassifier(**study_lgbc.best_params)
  342. hgbc = HistGradientBoostingClassifier(**study_hgbc.best_params)
  343.  
  344. models = [gbc,xgbc,lgbc,gbc,hgbc]
  345. names = ['Gradient Boosting Classifier', 'XGBoost Classifier', 'CatBoost Classifier',
  346.          'Light GBM Classifier', 'Hist Gradient Boosting Classifier']
  347.  
  348. # Calculating various model performance's metrics
  349. scorelist = {}
  350.  
  351.  
  352. for name, classifier in zip(names, models):
  353.     acc, f1, recall, precision = calculate_score(classifier, X_train, y_train, X_test, y_test)
  354.     scorelist[name] = (acc, f1, recall, precision)
  355.  
  356. scorelist
  357.  
  358.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement