# -*- coding: utf-8 -*-
"""
Created on Mon Jul 23 11:40:14 2018

@author: u396415
"""
import pandas as pd
import numpy as np
import sklearn

from datetime import date
from datetime import time

import matplotlib.pyplot as plt  # needed for the ROC plots below (missing in the original)

import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from catboost import CatBoostClassifier

h2o.init(max_mem_size="500M", nthreads=2)

X = pd.read_csv('H:/Zero/data_g.csv')

## feature engineering: creating age columns
X['CCR_CC_CLAIM_LOSSDATE'] = pd.to_datetime(X['CCR_CC_CLAIM_LOSSDATE'])
X['POLICY_SC_POLICYINCEPTIONDATE'] = pd.to_datetime(X['POLICY_SC_POLICYINCEPTIONDATE'])
X['CLAIMANT_DATEOFBIRTH'] = pd.to_datetime(X['CLAIMANT_DATEOFBIRTH'], errors='coerce')

# claimant age at inception: the original computed this identically to POLICY_AGE,
# which looks like a copy-paste slip; inception date minus date of birth is assumed here
X['AGE_AT_INCEPT'] = X['POLICY_SC_POLICYINCEPTIONDATE'] - X['CLAIMANT_DATEOFBIRTH']
X['POLICY_AGE'] = X['CCR_CC_CLAIM_LOSSDATE'] - X['POLICY_SC_POLICYINCEPTIONDATE']
X['AGE'] = X['CCR_CC_CLAIM_LOSSDATE'] - X['CLAIMANT_DATEOFBIRTH']

## converting the timedeltas to numeric (days)
X['AGE_AT_INCEPT'] = X['AGE_AT_INCEPT'] / np.timedelta64(1, 'D')
X['POLICY_AGE'] = X['POLICY_AGE'] / np.timedelta64(1, 'D')
X['AGE'] = X['AGE'] / np.timedelta64(1, 'D')

# looking at the data
X.shape
X.dtypes
description = X.describe()

# unwanted columns
unwan_cols = ['CCR_CC_CLAIM_LOSSDATE', 'CLAIMANT_DATEOFBIRTH', 'POLICY_SC_POLICYINCEPTIONDATE', 'CUST_NUM']

cols = [col for col in X.columns if col not in unwan_cols]
X = X[cols]

y = X['CCR_CCTL_SC_FULFILMENTPATH_NAME']  # 'data' was undefined; the target column lives in X

# splitting
train, test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=700)
train.shape
test.shape

train_ID = train['CCR_CC_CLAIM_CLAIMNUMBER']
y_train = train['CCR_CCTL_SC_FULFILMENTPATH_NAME']

# dropping the ID and target columns from the feature frames
train.drop("CCR_CC_CLAIM_CLAIMNUMBER", axis=1, inplace=True)
train.drop("CCR_CCTL_SC_FULFILMENTPATH_NAME", axis=1, inplace=True)
test.drop("CCR_CC_CLAIM_CLAIMNUMBER", axis=1, inplace=True)
test.drop("CCR_CCTL_SC_FULFILMENTPATH_NAME", axis=1, inplace=True)  # the original left the target in test

## categorical feature indices (positions of the non-float columns, for CatBoost)
cat_feature_indices = np.where(train.dtypes != np.float64)[0]  # np.float is deprecated

# total missing values per column
missmap = train.isnull().sum().to_frame().sort_values(0, ascending=False)
missmap.head()

# a few columns have ~30% missing values
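# A follow-up sketch (an addition, not in the original): drop columns whose
# missing share exceeds an assumed 30% cutoff, per the observation above.
# Note: cat_feature_indices would need recomputing after any column drop.
MISSING_THRESHOLD = 0.30  # assumed cutoff
high_missing_cols = train.columns[train.isnull().mean() > MISSING_THRESHOLD]
train.drop(high_missing_cols, axis=1, inplace=True)
test.drop(high_missing_cols, axis=1, inplace=True)
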
# removing columns with only one unique value
cols_with_onlyone_val = train.columns[train.nunique() == 1]
train.drop(cols_with_onlyone_val.values, axis=1, inplace=True)
test.drop(cols_with_onlyone_val.values, axis=1, inplace=True)

## comparing the columns to find exact duplicates
# (the inner loop body was missing in the original; the usual pairwise
# duplicate-column check after rounding is assumed here)
NUM_OF_DECIMALS = 32
train = train.round(NUM_OF_DECIMALS)
test = test.round(NUM_OF_DECIMALS)
colsToRemove = []
columns = train.columns
for i in range(len(columns) - 1):
    v = train[columns[i]].values
    dupCols = []
    for j in range(i + 1, len(columns)):
        if np.array_equal(v, train[columns[j]].values):
            dupCols.append(columns[j])
    colsToRemove.extend(dupCols)
train.drop(colsToRemove, axis=1, inplace=True)
test.drop(colsToRemove, axis=1, inplace=True)

datatypes = train.dtypes

# convert the pandas frames to h2o frames
train_h2o_df = h2o.H2OFrame(train)
train_h2o_df.set_names(list(train.columns))
train_h2o_df['response'] = h2o.H2OFrame(y_train.to_frame())  # 'labels_train' was undefined
train_h2o_df['response'] = train_h2o_df['response'].asfactor()

test_h2o_df = h2o.H2OFrame(test)
test_h2o_df.set_names(list(test.columns))  # 'X_df' was undefined
test_h2o_df['response'] = h2o.H2OFrame(y_test.to_frame())  # 'y[labels_test]' was undefined
test_h2o_df['response'] = test_h2o_df['response'].asfactor()

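# The H2O estimators are imported above but never trained; a minimal sketch of
# how they could be used on these frames (parameter values are illustrative
# assumptions, not tuned choices):
h2o_gbm = H2OGradientBoostingEstimator(ntrees=50, max_depth=5, learn_rate=0.1)
h2o_gbm.train(x=list(train.columns), y='response',
              training_frame=train_h2o_df, validation_frame=test_h2o_df)
h2o_gbm.model_performance(test_h2o_df)
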
from sklearn import model_selection
from sklearn import ensemble

NUM_OF_FEATURES = 1000

def rmsle(y, pred):
    return np.sqrt(np.mean(np.power(y - pred, 2)))

# quick feature-importance ranking on a hold-out split
# ('x1', 'x2', 'y1', 'y2' were undefined in the original; an 80/20 split with a
# 0/1 encoding of the 'Do and Charge' class, the positive class used in the ROC
# code below, is assumed so the regressor can run; features must be numeric here)
y_train_bin = (y_train == 'Do and Charge').astype(int)
x1, x2, y1, y2 = model_selection.train_test_split(train, y_train_bin, test_size=0.20, random_state=5)

model = ensemble.RandomForestRegressor(n_jobs=-1, random_state=7)
model.fit(x1, y1)
print(rmsle(y2, model.predict(x2)))

# keep the top NUM_OF_FEATURES columns by importance
col = pd.DataFrame({'importance': model.feature_importances_, 'feature': train.columns}).sort_values(
    by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values
train = train[col]
test = test[col]
train.shape

from sklearn.metrics import mean_squared_error

import lightgbm as lgb

# defined but never fitted below; LGBMClassifier is assumed since the objective
# is binary (the original instantiated LGBMRegressor)
model_lgb = lgb.LGBMClassifier(objective='binary', num_leaves=144,
                               learning_rate=0.005, n_estimators=720, max_depth=13,
                               metric='rmse', is_training_metric=True,
                               max_bin=55, bagging_fraction=0.8, verbose=-1,
                               bagging_freq=5, feature_fraction=0.9)

# grid for sklearn's GradientBoostingClassifier; the commented-out entries are
# H2O-style parameter names kept for reference
param_grid = {
    'n_estimators': [50],
    'max_depth': range(4, 11),
    # search a large space of row sampling rates per tree
    # 'sample_rate': 1.0,
    # search a large space of column sampling rates per split
    # 'col_sample_rate': 1.0,
    # search a large space of column sampling rates per tree
    # 'col_sample_rate_per_tree': 1.0,
    # search a large space of how column sampling per split should change as a function of the depth of the split
    # 'col_sample_rate_change_per_level': 1,
    # search a large space of the number of min rows in a terminal node
    'min_samples_leaf': [1, 5, 10, 20, 50, 100],
    # search a large space of the number of bins for split-finding for continuous and integer columns
    # 'nbins': [2**x for x in range(4, 11)[::2]],
    # search a large space of the number of bins for split-finding for categorical columns
    # 'nbins_cats': [2**x for x in range(4, 15)[::2]],
    # search a few minimum required relative error improvement thresholds for a split to happen
    'min_impurity_split': [0],  # [0, 1e-8, 1e-6, 1e-4]
    'learning_rate': [0.15],
    # 'learning_r': [0.15],  # duplicate of 'learning_rate' in the original
    # 'stopping_rounds': 5,
    # 'stopping_tolerance': 1e-4,
    # 'stopping_metric': 'misclassification',
    # 'balance_classes': [True, False]
}

# for now, only the LightGBM param dict below is used with lgb.train
params = {
    'objective': 'binary',
    'learning_rate': 0.02,
    'num_leaves': 76,
    'feature_fraction': 0.64,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss'
}

# LightGBM's binary objective needs 0/1 labels; the y_train_bin encoding from
# above is reused ('labels_train' and 'labels_test' were undefined in the original)
y_test_bin = (y_test == 'Do and Charge').astype(int)
d_train = lgb.Dataset(train, y_train_bin)
d_test = lgb.Dataset(test, y_test_bin)

# a validation set is required for early stopping
bst = lgb.train(params, d_train, 5000, valid_sets=[d_test],
                verbose_eval=50, early_stopping_rounds=100)
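# A small scoring sketch (an addition, not in the original): predict the
# hold-out set at the best early-stopping iteration and report log-loss.
from sklearn.metrics import log_loss
lgb_test_prob = bst.predict(test, num_iteration=bst.best_iteration)
print(log_loss(y_test_bin, lgb_test_prob))
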
gbm = GradientBoostingClassifier()
grid = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5)
grid.fit(train, y_train)  # 'labels_train' was undefined

best_model = grid.best_estimator_

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# ('Xtrain', 'X_train' and 'X_test' were undefined; the train/test frames built above are assumed)
y_train_pred = best_model.predict(train)
y_train_pred_prob = best_model.predict_proba(train)[:, 1]
y_test_pred = best_model.predict(test)
y_test_pred_prob = best_model.predict_proba(test)[:, 1]
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(accuracy_score(y_test, y_test_pred))

# Plot ROC
# Training
roc_auc = roc_auc_score(pd.get_dummies(y_train)['Do and Charge'], y_train_pred_prob)
fpr, tpr, th = roc_curve(pd.get_dummies(y_train)['Do and Charge'], y_train_pred_prob)

plt.figure()
plt.plot(fpr, tpr, label='AUC=%0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - Train Set')
plt.legend(loc='lower right')
plt.show()
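
# The same plot for the hold-out set (a sketch, not in the original):
roc_auc_test = roc_auc_score(pd.get_dummies(y_test)['Do and Charge'], y_test_pred_prob)
fpr_t, tpr_t, th_t = roc_curve(pd.get_dummies(y_test)['Do and Charge'], y_test_pred_prob)
plt.figure()
plt.plot(fpr_t, tpr_t, label='AUC=%0.2f' % roc_auc_test)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.title('ROC - Test Set')
plt.legend(loc='lower right')
plt.show()
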
# CatBoost classification
# (recompute the categorical indices: columns were dropped after the earlier
# computation, so the old positions are stale)
cat_feature_indices = np.where(train.dtypes != np.float64)[0]

model = CatBoostClassifier(iterations=50, depth=3, learning_rate=0.1)
model.fit(train, y_train, cat_features=cat_feature_indices, plot=True)
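
# A minimal scoring sketch (an addition, not in the original):
cat_test_pred = model.predict(test)
print(accuracy_score(y_test, cat_test_pred))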