import numpy as np
import pandas as pd
import collections
import copy
import gc
import _pickle as cPickle
import json
import syslog
import MySQLdb
import time
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold  # replaces the removed sklearn.cross_validation module
from sklearn.metrics import confusion_matrix
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics  # additional sklearn functions
from sklearn.model_selection import GridSearchCV  # performing grid search (replaces sklearn.grid_search)
import config
conn = MySQLdb.connect(user=config.mysql_user,
                       host=config.mysql_host,
                       password=config.mysql_password,
                       database=config.mysql_database)
c = conn.cursor()

current_day = time.strftime('%Y-%m-%d')
game_id = 35427
anticipation_duration = 7
clf_name = 'XGBClassifier'
clf_class = XGBClassifier
- sql = """select id from predictors
- where game_id=%s and p_type='quantifiable'""" % game_id
- c.execute(sql)
- predictors = c.fetchall()
- sql = "select * from training_data_%s where churned=0" % game_id
- churn_0_df = pd.read_sql(sql, conn)
- sql = "select * from training_data_%s where churned=1" % game_id
- churn_1_df = pd.read_sql(sql, conn)
- to_drop = ['user_id', 'last_game_day']
- for i in range(anticipation_duration-1, -1, -1):
- for p in predictors:
- to_drop.append("col_%s_%s_val" % (config.PREDICTION_ACCOUNTING_RANGE-(i+1), p[0]))
- to_drop.append("col_%s_%s_avg" % (config.PREDICTION_ACCOUNTING_RANGE-(i+1), p[0]))
- # undersampling
- churn_0_feat_space = churn_0_df.drop(to_drop, axis=1)
- churn_1_feat_space = churn_1_df.drop(to_drop, axis=1)
- churn_1_feat_space = churn_1_feat_space.iloc[np.random.randint(0, len(churn_1_df), size=len(churn_0_df))]
- train = pd.concat([churn_0_feat_space, churn_1_feat_space], ignore_index=True)
- target = "churned"
- to_drop.append(target)
- preds = [x for x in train.columns if x not in to_drop]
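# Illustrative sanity check (not in the original paste): after the
# undersampling step both classes should contribute a comparable number of
# rows, and `preds` should exclude every dropped column.
print(train[target].value_counts())
print("number of features:", len(preds))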
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        # Use xgboost's built-in CV with early stopping to pick n_estimators
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')

    # Predict training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))
    print("nb estimators: ", alg.get_params()['n_estimators'])
    print("feature importance scores: ")
    print(pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False))
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb1, train, preds)
# Output:
# Model Report
# Accuracy : 0.5468
# AUC Score (Train): 0.574191
# n_estimators = 43
# tune max_depth and min_child_weight
param_test1 = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5]
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,
                            n_estimators=43, max_depth=5,
                            min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=4, cv=5)
gsearch1.fit(train[preds], train[target])
gsearch1.best_params_, gsearch1.best_score_
# Output:
# {'max_depth': 5, 'min_child_weight': 1}, 0.542049796287913
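# Illustrative only (not in the original paste): the winning values can be
# read from best_params_ and reused in the later searches instead of being
# re-typed by hand.
best_max_depth = gsearch1.best_params_['max_depth']
best_min_child_weight = gsearch1.best_params_['min_child_weight']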
# tune gamma
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
gsearch3 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=43, max_depth=5,
                            min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test3, scoring='roc_auc', n_jobs=4, cv=5)
gsearch3.fit(train[preds], train[target])
gsearch3.best_params_, gsearch3.best_score_
# Output:
# {'gamma': 0.2}, 0.5426636487098234
# tune subsample and colsample_bytree
param_test5 = {
    'subsample': [i / 100.0 for i in range(75, 90, 5)],
    'colsample_bytree': [i / 100.0 for i in range(75, 90, 5)]
}
gsearch5 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=43, max_depth=5,
                            min_child_weight=1, gamma=0.2, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test5, scoring='roc_auc', n_jobs=4, cv=5)
gsearch5.fit(train[preds], train[target])
gsearch5.best_params_, gsearch5.best_score_
# Output:
# {'colsample_bytree': 0.85, 'subsample': 0.85}, 0.5483186962426437
xgb3 = XGBClassifier(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=0.005,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb3, train, preds)
# Output:
# Model Report
# Accuracy : 0.5489
# AUC Score (Train): 0.576144
# nb estimators: 42
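# Illustrative sketch (not part of the original paste): the fitted model can
# be persisted with the cPickle import from the top of the script. The file
# name below is a placeholder.
with open('xgb_churn_%s.pkl' % game_id, 'wb') as f:
    cPickle.dump(xgb3, f)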
- sql = "select * from training_data_%s where churned=1" % game_id
- churn_1_df = pd.read_sql(sql, conn)
- to_drop = ['user_id', 'last_game_day']
- for i in range(anticipation_duration-1, -1, -1):
- for p in predictors:
- to_drop.append("col_%s_%s_val" % (config.PREDICTION_ACCOUNTING_RANGE-(i+1), p[0]))
- to_drop.append("col_%s_%s_avg" % (config.PREDICTION_ACCOUNTING_RANGE-(i+1), p[0]))
- # oversampling
- churn_0_feat_space = churn_0_df.drop(to_drop, axis=1)
- churn_0_feat_space = churn_0_feat_space.iloc[np.random.randint(0, len(churn_0_df), size=len(churn_1_df))]
- churn_1_feat_space = churn_1_df.drop(to_drop, axis=1)
- y_1 = churn_1_df['churned']
- y_0 = y_1.copy()
- for i in range(len(y_0)):
- y_0[i] = 0
- # undersampling
- churn_0_feat_space = churn_0_df.drop(to_drop, axis=1)
- churn_1_feat_space = churn_1_df.drop(to_drop, axis=1)
- churn_1_feat_space = churn_1_feat_space.iloc[np.random.randint(0, len(churn_1_df), size=len(churn_0_df))]
- y_0 = churn_0_df['churned']
- y_1 = y_0.copy()
- for i in range(len(y_1)):
- y_1[i] = 1
- # equal sampling
- churn_0_feat_space = churn_0_df.drop(to_drop, axis=1)
- churn_1_feat_space = churn_1_df.drop(to_drop, axis=1)
- churn_feat_space = pd.concat([churn_0_feat_space, churn_1_feat_space], ignore_index=True)
- y_0 = churn_0_df['churned']
- y_1 = churn_1_df['churned']
- churn_feat_space = pd.concat([churn_0_feat_space, churn_1_feat_space], ignore_index=True)
- X = churn_feat_space.as_matrix().astype(np.float)
- y = pd.concat([y_0, y_1], ignore_index=True)
scaler = StandardScaler()
X = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle=True)
cross_results = {}
y_pred = y.copy()
# Iterate through folds
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    # Initialize a classifier with keyword arguments
    clf = clf_class(max_depth=3, eval_metric='auc')
    clf.fit(X_train, y_train)
    y_pred[test_index] = clf.predict(X_test)

cross_results[clf_name] = {'accuracy': np.mean(y == y_pred).tolist(),
                           'confusion_matrix': confusion_matrix(y, y_pred).tolist()}
cross_results
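# Illustrative follow-up (not in the original paste): precision and recall
# for the churned class can be read off the stored confusion matrix
# (rows = actual class, columns = predicted class).
tn, fp, fn, tp = np.array(cross_results[clf_name]['confusion_matrix']).ravel()
print("precision: %.4f" % (tp / float(tp + fp)))
print("recall:    %.4f" % (tp / float(tp + fn)))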