Advertisement
Guest User

XGBoost attempt

a guest
Oct 9th, 2017
113
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.71 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import collections
  4. import copy
  5. import gc
  6. import _pickle as cPickle
  7. import json
  8. import syslog
  9. import MySQLdb
  10. import time
  11. import sys
  12. from sklearn.preprocessing import StandardScaler
  13. from sklearn.cross_validation import KFold
  14. from sklearn.metrics import confusion_matrix
  15. import xgboost as xgb
  16. from xgboost.sklearn import XGBClassifier
  17. from sklearn import cross_validation, metrics #Additional sklearn functions
  18. from sklearn.grid_search import GridSearchCV #Performing grid search
  19.  
  20. import config
  21.  
  # Run configuration: which game to model, how many trailing days of the
  # accounting range are withheld from the features, and which classifier
  # the manual cross-validation at the end of the script instantiates.
  game_id = 35427
  anticipation_duration = 7
  clf_class = XGBClassifier
  clf_name = 'XGBClassifier'
  current_day = time.strftime('%Y-%m-%d')

  # One MySQL connection (credentials come from the project `config` module)
  # is shared by the raw cursor and by pandas.
  conn = MySQLdb.connect(
      user=config.mysql_user,
      host=config.mysql_host,
      password=config.mysql_password,
      database=config.mysql_database,
  )
  c = conn.cursor()

  # Ids of the quantifiable predictors configured for this game; each row is
  # a tuple whose first element is the predictor id.
  sql = """select id from predictors
  where game_id=%s and p_type='quantifiable'""" % game_id
  c.execute(sql)
  predictors = c.fetchall()

  # Training rows are loaded separately per class so they can be resampled
  # independently later on.
  sql = "select * from training_data_%s where churned=0" % game_id
  churn_0_df = pd.read_sql(sql, conn)

  sql = "select * from training_data_%s where churned=1" % game_id
  churn_1_df = pd.read_sql(sql, conn)
  46.  
  # Columns that must not be used as features: the identifiers, plus the
  # value/average columns of every predictor for the last
  # `anticipation_duration` days of the accounting range.
  to_drop = ['user_id', 'last_game_day']
  for i in range(anticipation_duration - 1, -1, -1):
      day = config.PREDICTION_ACCOUNTING_RANGE - (i + 1)
      for p in predictors:
          to_drop += [
              "col_%s_%s_val" % (day, p[0]),
              "col_%s_%s_avg" % (day, p[0]),
          ]

  # undersampling: draw (with replacement) as many churned rows as there are
  # non-churned rows so both classes have the same size.
  resampled_rows = np.random.randint(0, len(churn_1_df), size=len(churn_0_df))
  churn_0_feat_space = churn_0_df.drop(to_drop, axis=1)
  churn_1_feat_space = churn_1_df.drop(to_drop, axis=1).iloc[resampled_rows]

  train = pd.concat([churn_0_feat_space, churn_1_feat_space], ignore_index=True)

  # The label column stays in `train` but is excluded from the feature list.
  target = "churned"
  to_drop.append(target)
  preds = [column for column in train.columns if column not in to_drop]
  61.  
  def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
      """Fit ``alg`` on ``dtrain`` and print a report computed on the training data.

      Args:
          alg: an XGBoost scikit-learn style classifier; it is modified in place.
          dtrain: DataFrame holding the feature columns and the column named by
              the module-level ``target``.
          predictors: names of the feature columns to train on.
          useTrainCV: when true, ``xgb.cv`` is run first (AUC metric, early
              stopping) and ``n_estimators`` of ``alg`` is set to the number of
              rows in the returned result.
          cv_folds: number of folds passed to ``xgb.cv``.
          early_stopping_rounds: early-stopping patience passed to ``xgb.cv``.

      Returns:
          None. Accuracy, AUC, the estimator count and the feature importance
          scores are printed; all of them are measured on ``dtrain`` itself.
      """
      features = dtrain[predictors]
      labels = dtrain[target]

      if useTrainCV:
          dmatrix = xgb.DMatrix(features.values, label=labels.values)
          cv_history = xgb.cv(
              alg.get_xgb_params(),
              dmatrix,
              num_boost_round=alg.get_params()['n_estimators'],
              nfold=cv_folds,
              metrics='auc',
              early_stopping_rounds=early_stopping_rounds,
          )
          # One row per boosting round kept by the cross-validation.
          alg.set_params(n_estimators=cv_history.shape[0])

      # Fit on the full frame, then score the same rows.
      alg.fit(features, labels, eval_metric='auc')
      predicted_labels = alg.predict(features)
      positive_probability = alg.predict_proba(features)[:, 1]

      print("\nModel Report")
      print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, predicted_labels))
      print("AUC Score (Train): %f" % metrics.roc_auc_score(labels, positive_probability))
      print("nb estimators: ", alg.get_params()['n_estimators'])
      print("feature importance scores: ")
      importance = pd.Series(alg.get_booster().get_fscore())
      print(importance.sort_values(ascending=False))
  81.  
  82.  
  # Baseline model: a high learning rate with a generous estimator budget;
  # modelfit() cross-validates and overwrites n_estimators on the classifier.
  xgb1 = XGBClassifier(
      learning_rate=0.1,
      n_estimators=1000,
      max_depth=5,
      min_child_weight=1,
      gamma=0,
      subsample=0.8,
      colsample_bytree=0.8,
      objective='binary:logistic',
      nthread=4,
      scale_pos_weight=1,
      seed=27)
  modelfit(xgb1, train, preds)
  # Output recorded from a previous run (kept as a comment so the script
  # stays valid Python):
  #   Model Report
  #   Accuracy : 0.5468
  #   AUC Score (Train): 0.574191

  # n_estimators = 43
  # (presumably the estimator count reported by the run above; the grid
  # searches that follow hard-code this value)
  101.  
  # tune max_depth and min_child_weight
  # 5-fold grid search scored by ROC AUC; every other hyper-parameter is held
  # at the baseline values.
  param_test1 = {
      'max_depth': [3, 5, 7, 9],
      'min_child_weight': [1, 3, 5],
  }
  gsearch1 = GridSearchCV(
      estimator=XGBClassifier(
          learning_rate=0.1, n_estimators=43, max_depth=5,
          min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
          objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
      param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)

  gsearch1.fit(train[preds], train[target])
  # A bare expression shows nothing when run as a script, so print explicitly.
  print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
  # Best parameters / score recorded from a previous run:
  #   {'max_depth': 5, 'min_child_weight': 1}, 0.542049796287913
  116.  
  # tune gamma
  # 5-fold grid search over gamma in {0.0, 0.1, 0.2, 0.3, 0.4}, scored by
  # ROC AUC, with the remaining hyper-parameters held fixed.
  param_test3 = {
      'gamma': [i / 10.0 for i in range(0, 5)],
  }

  gsearch3 = GridSearchCV(
      estimator=XGBClassifier(
          learning_rate=0.1, n_estimators=43, max_depth=5,
          min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
          objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
      param_grid=param_test3, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
  gsearch3.fit(train[preds], train[target])
  # A bare expression shows nothing when run as a script, so print explicitly.
  print(gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_)
  # Best parameters / score recorded from a previous run:
  #   {'gamma': 0.2}, 0.5426636487098234
  129.  
  # tune subsample, colsample_bytree
  # 5-fold grid search over {0.75, 0.80, 0.85} for both sampling ratios,
  # scored by ROC AUC, using gamma=0.2 in the base estimator.
  param_test5 = {
      'subsample': [i / 100.0 for i in range(75, 90, 5)],
      'colsample_bytree': [i / 100.0 for i in range(75, 90, 5)],
  }
  gsearch5 = GridSearchCV(
      estimator=XGBClassifier(
          learning_rate=0.1, n_estimators=43, max_depth=5,
          min_child_weight=1, gamma=0.2, subsample=0.8, colsample_bytree=0.8,
          objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
      param_grid=param_test5, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
  gsearch5.fit(train[preds], train[target])
  # A bare expression shows nothing when run as a script, so print explicitly.
  print(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)
  # Best parameters / score recorded from a previous run:
  #   {'colsample_bytree': 0.85, 'subsample': 0.85}, 0.5483186962426437
  142.  
  143.  
  # Final fit: lower learning rate with the sampling ratios used in the last
  # grid search plus a small L1 penalty; modelfit() again cross-validates and
  # overwrites n_estimators on the classifier.
  # NOTE(review): the recorded gamma search result above is {'gamma': 0.2},
  # yet gamma=0 is used here -- confirm this is intentional.
  xgb3 = XGBClassifier(
      learning_rate=0.01,
      n_estimators=1000,
      max_depth=5,
      min_child_weight=1,
      gamma=0,
      subsample=0.85,
      colsample_bytree=0.85,
      reg_alpha=0.005,
      objective='binary:logistic',
      nthread=4,
      scale_pos_weight=1,
      seed=27)
  modelfit(xgb3, train, preds)
  # Output recorded from a previous run (kept as a comment so the script
  # stays valid Python):
  #   Model Report
  #   Accuracy : 0.5489
  #   AUC Score (Train): 0.576144
  #   nb estimators: 42
  162.  
  163.  
  164.  
  # Second experiment: build a plain feature matrix X and label vector y for
  # the manual k-fold evaluation that follows.
  sql = "select * from training_data_%s where churned=1" % game_id
  churn_1_df = pd.read_sql(sql, conn)

  # Columns excluded from the features: the identifiers, plus the
  # value/average columns of every predictor for the last
  # `anticipation_duration` days of the accounting range.
  to_drop = ['user_id', 'last_game_day']
  for i in range(anticipation_duration - 1, -1, -1):
      for p in predictors:
          to_drop.append("col_%s_%s_val" % (config.PREDICTION_ACCOUNTING_RANGE - (i + 1), p[0]))
          to_drop.append("col_%s_%s_avg" % (config.PREDICTION_ACCOUNTING_RANGE - (i + 1), p[0]))

  # The 'churned' column is deliberately kept at this point so that labels
  # travel with their rows through any resampling below.
  churn_0_feat_space = churn_0_df.drop(to_drop, axis=1)
  churn_1_feat_space = churn_1_df.drop(to_drop, axis=1)

  # Class-balancing strategy. The script used to run all three variants one
  # after another, each overwriting the previous one, so only the last
  # ('equal': use every row of both classes) ever took effect. It is now an
  # explicit switch with the same effective default.
  sampling = 'equal'
  if sampling == 'oversampling':
      # Draw non-churned rows (with replacement) up to the churned row count.
      churn_0_feat_space = churn_0_feat_space.iloc[
          np.random.randint(0, len(churn_0_df), size=len(churn_1_df))]
  elif sampling == 'undersampling':
      # Draw churned rows (with replacement) down to the non-churned row count.
      churn_1_feat_space = churn_1_feat_space.iloc[
          np.random.randint(0, len(churn_1_df), size=len(churn_0_df))]
  elif sampling != 'equal':
      raise ValueError("unknown sampling strategy: %r" % sampling)

  # Labels are read from the (possibly resampled) frames instead of being
  # filled in by index loops.
  y_0 = churn_0_feat_space['churned']
  y_1 = churn_1_feat_space['churned']
  y = pd.concat([y_0, y_1], ignore_index=True)

  # Remove the label from the features: previously 'churned' was still part
  # of X, which leaked the target into the classifier's input.
  churn_feat_space = pd.concat(
      [churn_0_feat_space, churn_1_feat_space], ignore_index=True
  ).drop('churned', axis=1)
  # `.values` / builtin float replace the removed DataFrame.as_matrix() and
  # np.float spellings.
  X = churn_feat_space.values.astype(float)
  202.  
  # Standardise every feature column.
  # NOTE(review): the scaler is fitted on all rows before the folds are
  # split, so test-fold statistics influence the training features.
  scaler = StandardScaler()
  X = scaler.fit_transform(X)

  kf = KFold(len(y), n_folds=5, shuffle=True)
  cross_results = {}

  # Out-of-fold predictions: each row is predicted by the model trained on
  # the folds that do not contain it.
  y_pred = y.copy()
  # Iterate through folds
  for train_index, test_index in kf:
      X_train, X_test = X[train_index], X[test_index]
      y_train = y[train_index]
      # Instantiate the configured classifier; the original referenced an
      # undefined name `XB`, which raised NameError.
      clf = clf_class(max_depth=3, eval_metric='auc')
      clf.fit(X_train, y_train)
      y_pred[test_index] = clf.predict(X_test)

  cross_results[clf_name] = {
      'accuracy': np.mean(y == y_pred).tolist(),
      'confusion_matrix': confusion_matrix(y, y_pred).tolist(),
  }
  # A bare expression shows nothing when run as a script, so print explicitly.
  print(cross_results)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement