Nam_Hoang_Waw
Machine Learning II - exercise 2 - AdaBoost_GradientBoosting
Oct 29th, 2023
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 28 23:46:48 2023

@author: Lenovo
"""

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import statsmodels.api as sm
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn_genetic import GASearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from skopt import BayesSearchCV
import optuna
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve

# Load breast cancer data
data = load_breast_cancer()
feature_names = data.feature_names
print(data.DESCR)

df = pd.DataFrame(data.data, columns=data.feature_names)

df.isnull().sum()
df.isna().sum()
df['target'] = data.target

df['target'].value_counts()[1] # benign (target = 1): 357 samples
df['target'].value_counts()[0] # malignant (target = 0): 212 samples

# Split off a 20% test set, then a further 15% of the remaining data as a validation set
# (roughly 68% train / 12% validation / 20% test)
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=1)
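
# A quick sanity check on the resulting split sizes (a minimal sketch; the ~68/12/20
# proportions follow from the two test_size values used above):
for name, part in [('train', X_train), ('validation', X_val), ('test', X_test)]:
    print(f"{name}: {part.shape[0]} samples ({part.shape[0] / len(df):.0%})")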

# Feature scaling:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)   # use the scaler fitted on the training data only, to avoid leaking validation statistics
X_test = sc.transform(X_test)

X_train = pd.DataFrame(X_train, columns=df.drop(columns=['target']).columns)
X_val = pd.DataFrame(X_val, columns=df.drop(columns=['target']).columns)
X_test = pd.DataFrame(X_test, columns=df.drop(columns=['target']).columns)

f,ax = plt.subplots(figsize=(18, 18))
sns.heatmap(X_train.corr(method='spearman'), annot=True, linewidths=.5, fmt= '.1f',ax=ax)

# Quite a lot of variables are highly correlated => this suggests dropping some of them to avoid
# an unstable model on the test set.

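# To make the claim above concrete, the sketch below lists the feature pairs whose absolute
# Spearman correlation exceeds a threshold (the 0.9 cut-off is an assumption, not something
# prescribed by the exercise).
corr = X_train.corr(method='spearman').abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))   # keep each pair once
highly_correlated_pairs = upper.stack().loc[lambda s: s > 0.9].sort_values(ascending=False)
print(highly_correlated_pairs)
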
# feature selection:

    # Using recursive feature elimination with random forest classifier to decide the max number of features for logistic regression
number_of_random_states = 30
average_optimal = np.zeros(30)   # one slot per candidate feature count (the dataset has 30 features)

i = 0
for rs in range(number_of_random_states):
    rf_classifier = rfc(random_state = rs)
    rfecv = RFECV(estimator=rf_classifier, step=1, cv=5, scoring='f1')
    rfecv = rfecv.fit(X_train, y_train)
    average_optimal += np.asarray(rfecv.cv_results_["mean_test_score"])
    i = i + 1
    print ('progress ' + str(round(i/number_of_random_states*100)) + '%')

average_optimal /= number_of_random_states
plt.plot(range(1, len(rfecv.cv_results_['mean_test_score']) + 1), average_optimal)
print("Number of features selected :", np.argmax(average_optimal)+1)
print("Evaluation of the optimal f1 :", np.max(average_optimal))
plt.show()

# Number of features selected : 15

    # List of the top 15 features chosen by RFE (using the feature count suggested by RFECV)
most_appearing_features = []

for rs in range(10):
    rf_classifier = rfc(random_state=rs)
    rfe = RFE(estimator=rf_classifier, n_features_to_select=15, step=1)
    rfe = rfe.fit(X_train, y_train)
    most_appearing_features.append(X_train.columns[rfe.support_].tolist())
most_appearing_features = [item for sublist in most_appearing_features for item in sublist]

print('Most appearing features :')
print(Counter(most_appearing_features).most_common(15))

# [('mean radius', 10),
# ('mean texture', 10),
# ('mean area', 10),
# ('mean concavity', 10),
# ('mean concave points', 10),
# ('area error', 10),
# ('worst radius', 10),
# ('worst texture', 10),
# ('worst perimeter', 10),
# ('worst area', 10),
# ('worst smoothness', 10),
# ('worst concavity', 10),
# ('worst concave points', 10),
# ('mean perimeter', 9),
# ('worst compactness', 9)]

# Construct correlation matrix of the top 15 features:
X_train_selected = X_train[['mean radius', 'mean texture', 'mean area', 'mean concavity', 'mean concave points',
                            'area error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness',
                            'worst concavity', 'worst concave points', 'mean perimeter', 'worst compactness']]

f,ax = plt.subplots(figsize=(18, 18))
sns.heatmap(X_train_selected.corr(method='spearman'), annot=True, linewidths=.5, fmt= '.1f',ax=ax)

# Fitting these 15 features to the logit model from statsmodels library to check if all of them are statistically significant
X = X_train_selected
Y = y_train.reset_index()
Y = Y['target']
logit_model = sm.Logit(Y, X)
logit_result = logit_model.fit(method="nm", maxiter=5000)
print(logit_result.summary2())

# mean texture, mean concavity, mean concave points, worst perimeter, worst smoothness are statistically significant
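
# The same conclusion can be read off programmatically; a minimal sketch using the fitted
# statsmodels result (the 0.05 significance level is an assumption):
significant_features = logit_result.pvalues[logit_result.pvalues < 0.05]
print("Statistically significant features:")
print(significant_features.sort_values())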

X_train_selected_final = X_train[['mean concave points', 'worst perimeter', 'worst smoothness', 'mean texture',
                                  'mean concavity']]

X_test_selected = X_test[['mean concave points', 'worst perimeter', 'worst smoothness', 'mean texture',
                          'mean concavity']]

X_val_selected = X_val[['mean concave points', 'worst perimeter', 'worst smoothness', 'mean texture',
                        'mean concavity']]

# Creating logistic model using sklearn based on the selected variables:
model = LogisticRegression(solver='liblinear', max_iter=500, penalty = 'l1')
model.fit(X_train_selected_final, Y)
coefficients = model.coef_

y_pred = model.predict(X_train_selected_final)
f1_in_sample = f1_score(y_train, y_pred)
print(f"F1 Score on Training Set: {f1_in_sample:.3f}") # in-sample F1 score = 0.975

y_pred = model.predict(X_test_selected)
f1_out_sample = f1_score(y_test, y_pred)
print(f"F1 Score on Test Set: {f1_out_sample:.3f}") # out-of-sample F1 score = 0.959

# Hyperparameter tuning with Optuna:

def logit_objective(trial: optuna.trial.Trial):
    C = trial.suggest_float("C", 0.001, 10)
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    model = LogisticRegression(solver='liblinear', C=C, penalty=penalty)
    model.fit(X_train_selected_final, y_train)

    y_pred = model.predict(X_val_selected)
    f1 = f1_score(y_val, y_pred)

    return f1

sampler = optuna.samplers.TPESampler(seed=999)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(logit_objective, n_trials=100)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# Best trial:
#  Value:  0.9772727272727273
#  Params:
#    C: 8.0344769727568
#    penalty: l1

logistic_model = LogisticRegression(solver='liblinear', penalty= trial.params["penalty"], C = trial.params["C"], max_iter=10000, random_state=0)
logistic_model_fit = logistic_model.fit(X_train_selected_final, y_train)
y_prob = logistic_model_fit.predict_log_proba(X_test_selected)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_prob)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()


y_pred = logistic_model.predict(X_train_selected_final)
f1_in_sample = f1_score(y_train, y_pred)
print(f"F1 Score on Training Set: {f1_in_sample:.3f}") # in-sample F1 score = 0.975


y_pred = logistic_model.predict(X_test_selected)
f1_out_sample = f1_score(y_test, y_pred)
print(f"F1 Score on Test Set: {f1_out_sample:.3f}") # out-of-sample F1 score = 0.959

# Looks like there is no improvement in either in-sample or out-of-sample performance after hyperparameter tuning

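# As an additional check (a small sketch using metrics already imported above, not part of the
# original exercise), the tuned logistic model can also be compared on accuracy, precision and
# ROC AUC on the test set; F1 alone may hide small differences.
y_pred = logistic_model.predict(X_test_selected)
y_score = logistic_model.predict_proba(X_test_selected)[:, 1]
print(f"Accuracy : {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"ROC AUC  : {roc_auc_score(y_test, y_score):.3f}")
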
############### ADA BOOST ###############

# Fitting an AdaBoost model with default hyperparameters:

ada_model = AdaBoostClassifier(random_state=1)
ada_model.fit(X_train, y_train)

# In-sample F1 score
y_pred = ada_model.predict(X_train)
print(f1_score(y_train, y_pred)) # f1_score = 1.0

# Out-of-sample F1 score
y_pred = ada_model.predict(X_test)
print(f1_score(y_test, y_pred)) # f1_score = 0.95238

# Hyperparameter tuning using Optuna:

def ADA_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 1.0, log=True)
    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred)

sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(ADA_objective, n_trials=100)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# Best trial:
#  Value:  0.98876
#  Params:
#    n_estimators: 238
#    learning_rate: 0.2758347554916674

ada_model = AdaBoostClassifier(n_estimators=trial.params["n_estimators"], learning_rate=trial.params["learning_rate"], random_state=1)
ada_model.fit(X_train, y_train)

feature_importances = ada_model.feature_importances_
sorted_idx = feature_importances.argsort()[::-1]

plt.figure(figsize=(10, 6))
plt.bar(range(X_train.shape[1]), feature_importances[sorted_idx], align="center")
plt.xticks(range(X_train.shape[1]), data.feature_names[sorted_idx], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.title("Feature Importances in AdaBoost Model")
plt.show()

# mean perimeter, smoothness error, mean smoothness and mean radius have feature importance = 0!!
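
# The zero-importance features can also be listed programmatically rather than read off the
# plot (a small sketch based on the fitted model above):
zero_importance_features = X_train.columns[feature_importances == 0].tolist()
print("Features with zero importance:", zero_importance_features)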

y_pred = ada_model.predict(X_test)

print(f1_score(y_test, y_pred)) # f1_score = 0.97260

# construct correlation matrix of the remaining features

X_train_selected_features = X_train.drop(columns=['mean perimeter', 'smoothness error', 'mean smoothness', 'mean radius'])

f,ax = plt.subplots(figsize=(18, 18))
sns.heatmap(X_train_selected_features.corr(method='spearman'), annot=True, linewidths=.5, fmt= '.1f',ax=ax)

# radius error has an almost perfect correlation with area error => drop radius error as it has the lower feature importance score
# worst perimeter, worst area, mean area have an almost perfect correlation with worst radius => drop those 3 as they have lower feature importance scores
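
# A quick check of the correlations cited above (a small sketch; values are computed on the
# training data only):
print(X_train[['radius error', 'area error']].corr(method='spearman'))
print(X_train[['worst radius', 'worst perimeter', 'worst area', 'mean area']].corr(method='spearman'))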
  292.  
  293. X_train_selected_features = X_train.drop(columns=['mean perimeter', 'smoothness error', 'mean smoothness', 'mean radius',
  294.                                                   'radius error', 'worst perimeter', 'worst area', 'mean area'])
  295.  
  296. X_test_selected_features = X_test.drop(columns=['mean perimeter', 'smoothness error', 'mean smoothness', 'mean radius',
  297.                                                   'radius error', 'worst perimeter', 'worst area', 'mean area'])
  298.  
  299. X_val_selected_features = X_val.drop(columns=['mean perimeter', 'smoothness error', 'mean smoothness', 'mean radius',
  300.                                                   'radius error', 'worst perimeter', 'worst area', 'mean area'])
  301.  
  302.  
  303.     # Hyperparameter tunning on selected features:
  304. def ADA_objective(trial):
  305.     n_estimators = trial.suggest_int("n_estimators", 50, 500)
  306.     learning_rate = trial.suggest_float("learning_rate", 0.01, 1.0, log=True)
  307.     model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=1)
  308.     model.fit(X_train_selected_features, y_train)
  309.     y_pred = model.predict(X_val_selected_features)
  310.     return f1_score(y_val, y_pred)
  311.  
  312. sampler = optuna.samplers.TPESampler(seed=1)
  313. study = optuna.create_study(direction="maximize", sampler=sampler)
  314. study.optimize(ADA_objective, n_trials=100)
  315.  
  316. print("Number of finished trials: ", len(study.trials))
  317. print("Best trial:")
  318. trial = study.best_trial
  319. print("  Value: ", trial.value)
  320. print("  Params: ")
  321. for key, value in trial.params.items():
  322.     print("    {}: {}".format(key, value))
  323.    
  324. # Best trial:
  325. #  Value:  0.98876 an imporovement in in-sample f1 score
  326. #  Params:
  327. #    n_estimators: 142
  328. #    learning_rate: 0.5704727088203682
  329.  
  330. ada_model = AdaBoostClassifier(n_estimators=trial.params["n_estimators"], learning_rate=trial.params["learning_rate"], random_state=1)
  331. ada_model.fit(X_train_selected_features, y_train)
  332.  
  333. feature_importances = ada_model.feature_importances_
  334. sorted_idx = feature_importances.argsort()[::-1]
  335.  
  336. plt.figure(figsize=(10, 6))
  337. plt.bar(range(X_train_selected_features.shape[1]), feature_importances[sorted_idx], align="center")
  338. plt.xticks(range(X_train_selected_features.shape[1]), ada_model.feature_names_in_, rotation=90)
  339. plt.xlabel("Feature")
  340. plt.ylabel("Feature Importance")
  341. plt.title("Feature Importances in ADA Boost Model")
  342. plt.show()
  343.  
  344. y_pred = ada_model.predict(X_test_selected_features)
  345.  
  346. print(f1_score(y_test, y_pred)) # out-of-sample f1_score = 0.95890 which is worse than f1_score = 0.97260 before dropping features
  347.  
############### GRADIENT BOOSTING ###############

# Fitting a GBM model with default hyperparameters:

GB_model = GradientBoostingClassifier(random_state=1)
GB_model.fit(X_train, y_train)

# In-sample F1 score
y_pred = GB_model.predict(X_train)
print(f1_score(y_train, y_pred)) # f1_score = 1.0

# Out-of-sample F1 score
y_pred = GB_model.predict(X_test)
print(f1_score(y_test, y_pred)) # f1_score = 0.9517

# Hyperparameter tuning using Optuna:

def GBM_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 1.0, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 10)
    subsample = trial.suggest_float("subsample", 0.1, 1.0, step = 0.1)
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                                       max_depth=max_depth, subsample=subsample, random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred)

sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(GBM_objective, n_trials=100)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# Best trial:
#  Value:  1.0
#  Params:
#    n_estimators: 98
#    learning_rate: 0.5257150170904044
#    max_depth: 4
#    subsample: 0.9

GB_model = GradientBoostingClassifier(n_estimators=trial.params["n_estimators"],
                                      learning_rate=trial.params["learning_rate"],
                                      max_depth = trial.params["max_depth"],
                                      subsample = trial.params["subsample"], random_state=1)
GB_model.fit(X_train, y_train)

# Visualize the feature importances in the model
feature_importances = GB_model.feature_importances_
sorted_idx = feature_importances.argsort()[::-1]

plt.figure(figsize=(10, 6))
plt.bar(range(X_train.shape[1]), feature_importances[sorted_idx], align="center")
plt.xticks(range(X_train.shape[1]), data.feature_names[sorted_idx], rotation=90)
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.title("Feature Importances in Gradient Boosting Model")
plt.show()

# Based on the feature importance graph, worst perimeter, worst concave points, worst texture, mean concave points, concave points error and worst radius are the most important features
# Let's build a correlation matrix to see if, among the top features, there exists any pair with high correlation

X_train_selected_features = X_train[['worst perimeter', 'worst concave points', 'worst texture', 'mean concave points', 'concave points error', 'worst radius']]

f,ax = plt.subplots(figsize=(18, 18))
sns.heatmap(X_train_selected_features.corr(method='spearman'), annot=True, linewidths=.5, fmt= '.1f',ax=ax)
# worst radius has a near-perfect correlation with worst perimeter, and given that worst perimeter has the highest importance value, we drop worst radius
# Final selected variables: 'worst perimeter', 'worst concave points', 'worst texture', 'mean concave points', 'concave points error'

# Before using the selected variables, we check the out-of-sample result based on the current model specification
y_pred = GB_model.predict(X_test)

print(f1_score(y_test, y_pred)) # f1_score = 0.9726

# Building GBM based on selected features:
    # Selected features based on Spearman correlation analysis:
X_train_selected_features = X_train[['worst perimeter', 'worst concave points', 'worst texture', 'mean concave points', 'concave points error']]
X_test_selected_features = X_test[['worst perimeter', 'worst concave points', 'worst texture', 'mean concave points', 'concave points error']]
X_val_selected_features = X_val[['worst perimeter', 'worst concave points', 'worst texture', 'mean concave points', 'concave points error']]

    # Hyperparameter tuning on selected features:
def GBM_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 1.0, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 10)
    subsample = trial.suggest_float("subsample", 0.1, 1.0, step = 0.1)
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate,
                                       max_depth=max_depth, subsample=subsample, random_state=1)
    model.fit(X_train_selected_features, y_train)
    y_pred = model.predict(X_val_selected_features)
    return f1_score(y_val, y_pred)

sampler = optuna.samplers.TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(GBM_objective, n_trials=100)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# Best trial:
#  Value:  1.0
#  Params:
#    n_estimators: 359
#    learning_rate: 0.31906341975134206
#    max_depth: 2
#    subsample: 0.8

GB_model = GradientBoostingClassifier(n_estimators=trial.params["n_estimators"],
                                      learning_rate=trial.params["learning_rate"],
                                      max_depth = trial.params["max_depth"],
                                      subsample = trial.params["subsample"], random_state=1)
GB_model.fit(X_train_selected_features, y_train)

feature_importances = GB_model.feature_importances_
sorted_idx = feature_importances.argsort()[::-1]

plt.figure(figsize=(10, 6))
plt.bar(range(X_train_selected_features.shape[1]), feature_importances[sorted_idx], align="center")
plt.xticks(range(X_train_selected_features.shape[1]), GB_model.feature_names_in_[sorted_idx], rotation=90)  # labels sorted to match the bars
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.title("Feature Importances in Gradient Boosting Model")
plt.show()

y_pred = GB_model.predict(X_test_selected_features)

print(f1_score(y_test, y_pred)) # f1_score = 0.9796 => better than f1_score = 0.9726 (before dropping worst radius)

############# MODEL COMPARISON #############

# Performance is compared via the out-of-sample F1 score before and after hyperparameter tuning
# (collected in the short sketch after these notes):

    # Logistic model with default hyperparameters + feature selection: 0.959
    # Logistic model after hyperparameter tuning + feature selection: 0.959

    # AdaBoost model with default hyperparameters: 0.95238
    # AdaBoost model after hyperparameter tuning: 0.97260
    # AdaBoost model after hyperparameter tuning + feature selection: 0.9589

    # GB model with default hyperparameters: 0.9517
    # GB model after hyperparameter tuning: 0.9726
    # GB model after hyperparameter tuning + feature selection: 0.9796

# Both AdaBoost and GB outperform Logistic Regression after hyperparameter tuning and feature selection,
# as shown by the higher F1 scores of both boosting models.

# Hyperparameter tuning has a positive impact on both AdaBoost and Gradient Boosting, as their F1 scores
# increased slightly after tuning. On the other hand, tuning has almost no impact on the logistic model,
# perhaps because the logistic model has much simpler hyperparameters than the other two.

# Feature selection based on feature importance + Spearman correlation has a positive impact on Gradient
# Boosting, while it has a negative impact on AdaBoost and almost no impact on the logistic model.
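
# A compact way to present the comparison above is to collect the quoted out-of-sample F1 scores
# into a small table (a sketch only; the numbers are the ones reported in the comments above, not
# re-computed here).
results = pd.Series({
    'Logistic (feature selection, default)': 0.959,
    'Logistic (feature selection, tuned)': 0.959,
    'AdaBoost (default)': 0.95238,
    'AdaBoost (tuned)': 0.97260,
    'AdaBoost (tuned + feature selection)': 0.9589,
    'GBM (default)': 0.9517,
    'GBM (tuned)': 0.9726,
    'GBM (tuned + feature selection)': 0.9796,
}, name='out-of-sample F1')
print(results.sort_values(ascending=False))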