# -*- coding: utf-8 -*-
"""
Created on Mon Jul 23 11:40:14 2018
@author: u396415
"""
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from datetime import date
from datetime import time
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier

h2o.init(max_mem_size="500M", nthreads=2)
X = pd.read_csv('H:/Zero/data_g.csv')

## feature engineering: creating age columns
X['CCR_CC_CLAIM_LOSSDATE'] = pd.to_datetime(X['CCR_CC_CLAIM_LOSSDATE'])
X['POLICY_SC_POLICYINCEPTIONDATE'] = pd.to_datetime(X['POLICY_SC_POLICYINCEPTIONDATE'])
X['CLAIMANT_DATEOFBIRTH'] = pd.to_datetime(X['CLAIMANT_DATEOFBIRTH'], errors='coerce')
# claimant age at policy inception (inception date minus date of birth)
X['AGE_AT_INCEPT'] = X['POLICY_SC_POLICYINCEPTIONDATE'] - X['CLAIMANT_DATEOFBIRTH']
X['POLICY_AGE'] = X['CCR_CC_CLAIM_LOSSDATE'] - X['POLICY_SC_POLICYINCEPTIONDATE']
X['AGE'] = X['CCR_CC_CLAIM_LOSSDATE'] - X['CLAIMANT_DATEOFBIRTH']
## converting the timedeltas to numeric days
X['AGE_AT_INCEPT'] = X['AGE_AT_INCEPT'] / np.timedelta64(1, 'D')
X['POLICY_AGE'] = X['POLICY_AGE'] / np.timedelta64(1, 'D')
X['AGE'] = X['AGE'] / np.timedelta64(1, 'D')
# looking at the data
X.shape
X.dtypes
description = X.describe()
# unwanted columns
unwan_cols = ['CCR_CC_CLAIM_LOSSDATE', 'CLAIMANT_DATEOFBIRTH', 'POLICY_SC_POLICYINCEPTIONDATE', 'CUST_NUM']
cols = [col for col in X.columns if col not in unwan_cols]
X = X[cols]
y = X['CCR_CCTL_SC_FULFILMENTPATH_NAME']
# splitting
train, test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=700)
train.shape
test.shape
train_ID = train['CCR_CC_CLAIM_CLAIMNUMBER']
y_train = train['CCR_CCTL_SC_FULFILMENTPATH_NAME']
# dropping the ID and target columns from both model matrices
train.drop("CCR_CC_CLAIM_CLAIMNUMBER", axis=1, inplace=True)
train.drop("CCR_CCTL_SC_FULFILMENTPATH_NAME", axis=1, inplace=True)
test.drop("CCR_CC_CLAIM_CLAIMNUMBER", axis=1, inplace=True)
test.drop("CCR_CCTL_SC_FULFILMENTPATH_NAME", axis=1, inplace=True)
## categorical feature indices
cat_feature_indices = np.where(train.dtypes != float)[0]
# total missing values
missmap = train.isnull().sum().to_frame().sort_values(0, ascending=False)
missmap.head()
# a few columns carry roughly 30% missing values
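# An optional sketch (not part of the original flow): if those ~30%-missing
# columns should be dropped rather than kept, a threshold filter like the one
# below would do it. Left commented out so the pipeline is unchanged.
# high_missing = missmap[missmap[0] > 0.30 * len(train)].index
# train.drop(high_missing, axis=1, inplace=True)
# test.drop(high_missing, axis=1, inplace=True)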
# removing columns with a single unique value
cols_with_onlyone_val = train.columns[train.nunique() == 1]
train.drop(cols_with_onlyone_val.values, axis=1, inplace=True)
test.drop(cols_with_onlyone_val.values, axis=1, inplace=True)
## comparing the columns and dropping exact duplicates
NUM_OF_DECIMALS = 32
train = train.round(NUM_OF_DECIMALS)
test = test.round(NUM_OF_DECIMALS)
colsToRemove = []
columns = train.columns
for i in range(len(columns) - 1):
    v = train[columns[i]].values
    for j in range(i + 1, len(columns)):
        # a later column identical to an earlier one carries no extra information
        if np.array_equal(v, train[columns[j]].values):
            colsToRemove.append(columns[j])
colsToRemove = list(set(colsToRemove))
train.drop(colsToRemove, axis=1, inplace=True)
test.drop(colsToRemove, axis=1, inplace=True)
# Convert the train/test split to h2o frames and attach the response as a factor
train_h2o_df = h2o.H2OFrame(train)
train_h2o_df.set_names(list(train.columns))
train_h2o_df['response'] = h2o.H2OFrame(y_train.to_frame())
train_h2o_df['response'] = train_h2o_df['response'].asfactor()
test_h2o_df = h2o.H2OFrame(test)
test_h2o_df.set_names(list(test.columns))
test_h2o_df['response'] = h2o.H2OFrame(y_test.to_frame())
test_h2o_df['response'] = test_h2o_df['response'].asfactor()
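# A minimal sketch of actually fitting one of the imported h2o estimators on
# these frames (the script imports them but never trains one). The
# hyperparameters here are assumptions, and .auc() assumes a binary response.
rf_h2o = H2ORandomForestEstimator(ntrees=100, max_depth=10, seed=700)
rf_h2o.train(x=list(train.columns), y='response', training_frame=train_h2o_df)
print(rf_h2o.model_performance(test_h2o_df).auc())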
from sklearn import model_selection
from sklearn import ensemble
NUM_OF_FEATURES = 1000
def rmsle(y, pred):
    # as written this is plain RMSE (no log transform)
    return np.sqrt(np.mean(np.power(y - pred, 2)))
# importance-based feature selection on an internal validation split;
# assumes the remaining columns are numeric and non-missing
# (encode categoricals and impute first otherwise)
y_train_enc, y_classes = pd.factorize(y_train)
x1, x2, y1, y2 = train_test_split(train, y_train_enc, test_size=0.20, random_state=7)
model = ensemble.RandomForestRegressor(n_jobs=-1, random_state=7)
model.fit(x1, y1)
print(rmsle(y2, model.predict(x2)))
col = pd.DataFrame({'importance': model.feature_importances_, 'feature': train.columns}).sort_values(
    by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values
train = train[col]
test = test[col]
train.shape
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
# binary objective, so the classifier wrapper is the appropriate estimator
model_lgb = lgb.LGBMClassifier(objective='binary', num_leaves=144,
                               learning_rate=0.005, n_estimators=720, max_depth=13,
                               max_bin=55, bagging_fraction=0.8, verbose=-1,
                               bagging_freq=5, feature_fraction=0.9)
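# A minimal usage sketch for the wrapper above; the sklearn-style API accepts
# the string labels directly (assumes the feature columns are numeric by now).
model_lgb.fit(train, y_train)
print(model_lgb.score(test, y_test))  # mean accuracy on the held-out split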
param_grid = {
    'n_estimators': [50],
    'max_depth': range(4, 11),
    # the commented entries below are h2o GBM-style knobs kept for reference;
    # they are not valid GradientBoostingClassifier parameters
    # 'sample_rate': 1.0,              # row sampling rate per tree
    # 'col_sample_rate': 1.0,          # column sampling rate per split
    # 'col_sample_rate_per_tree': 1.0,
    # 'col_sample_rate_change_per_level': 1,
    # minimum number of samples required in a leaf node
    'min_samples_leaf': [1, 5, 10, 20, 50, 100],
    # 'nbins': [2**x for x in range(4, 11)[::2]],       # bins for continuous splits
    # 'nbins_cats': [2**x for x in range(4, 15)[::2]],  # bins for categorical splits
    # minimum impurity decrease required for a split to happen
    'min_impurity_decrease': [0.0],  # [0.0, 1e-8, 1e-6, 1e-4]
    'learning_rate': [0.15],
    # 'stopping_rounds': 5,
    # 'stopping_tolerance': 1e-4,
    # 'stopping_metric': 'misclassification',
    # 'balance_classes': [True, False]
}
# for now only using the lgb param dict
params = {
    'objective': 'binary',
    'learning_rate': 0.02,
    'num_leaves': 76,
    'feature_fraction': 0.64,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss'
}
# lgb.Dataset needs numeric labels, so reuse the encoding from above
y_test_enc = y_classes.get_indexer(y_test)
d_train = lgb.Dataset(train, y_train_enc)
d_test = lgb.Dataset(test, y_test_enc, reference=d_train)
# early stopping needs a validation set to monitor
bst = lgb.train(params, d_train, 5000, valid_sets=[d_test],
                verbose_eval=50, early_stopping_rounds=100)
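# A minimal scoring sketch for the boosted model (assumes the binary encoding
# above; best_iteration is set by early stopping).
from sklearn.metrics import roc_auc_score
test_pred_prob = bst.predict(test, num_iteration=bst.best_iteration)
print(roc_auc_score(y_test_enc, test_pred_prob))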
gbm = GradientBoostingClassifier()
grid = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5)
grid.fit(train, y_train)
best_model = grid.best_estimator_
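# Worth printing what the search settled on before scoring best_model;
# both attributes are standard on a fitted GridSearchCV.
print(grid.best_params_)
print(grid.best_score_)  # mean cross-validated score of the best candidate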
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
y_train_pred = best_model.predict(train)
y_train_pred_prob = best_model.predict_proba(train)[:, 1]
y_test_pred = best_model.predict(test)
y_test_pred_prob = best_model.predict_proba(test)[:, 1]
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(accuracy_score(y_test, y_test_pred))
# Plot ROC
# Training set
roc_auc = roc_auc_score(pd.get_dummies(y_train)['Do and Charge'], y_train_pred_prob)
fpr, tpr, th = roc_curve(pd.get_dummies(y_train)['Do and Charge'], y_train_pred_prob)
plt.figure()
plt.plot(fpr, tpr, label='AUC=%0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - Train Set')
plt.legend(loc='lower right')
plt.show()
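# The same ROC, sketched for the held-out test set; mirrors the block above
# and assumes 'Do and Charge' is the positive class here as well.
roc_auc_test = roc_auc_score(pd.get_dummies(y_test)['Do and Charge'], y_test_pred_prob)
fpr_t, tpr_t, th_t = roc_curve(pd.get_dummies(y_test)['Do and Charge'], y_test_pred_prob)
plt.figure()
plt.plot(fpr_t, tpr_t, label='AUC=%0.2f' % roc_auc_test)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - Test Set')
plt.legend(loc='lower right')
plt.show()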
# catboost classification
# cat_feature_indices was computed before columns were dropped above, so
# recompute it against the current frame to keep the indices valid
cat_feature_indices = np.where(train.dtypes != float)[0]
model = CatBoostClassifier(iterations=50, depth=3, learning_rate=0.1)
model.fit(train, y_train, cat_features=cat_feature_indices, plot=True)
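# A minimal evaluation sketch for the fitted CatBoost model on the held-out split;
# CatBoostClassifier.score reports accuracy.
print(model.score(test, y_test))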