# -*- coding: utf-8 -*-
"""
Created on Mon Jul 23 11:40:14 2018
@author: u396415
"""
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from datetime import date
from datetime import time
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier

h2o.init(max_mem_size="500M", nthreads=2)
X = pd.read_csv('H:/Zero/data_g.csv')

## feature engineering: creating age columns
X['CCR_CC_CLAIM_LOSSDATE'] = pd.to_datetime(X['CCR_CC_CLAIM_LOSSDATE'])
X['POLICY_SC_POLICYINCEPTIONDATE'] = pd.to_datetime(X['POLICY_SC_POLICYINCEPTIONDATE'])
X['CLAIMANT_DATEOFBIRTH'] = pd.to_datetime(X['CLAIMANT_DATEOFBIRTH'], errors='coerce')
# claimant age at policy inception (inception date minus date of birth)
X['AGE_AT_INCEPT'] = X['POLICY_SC_POLICYINCEPTIONDATE'] - X['CLAIMANT_DATEOFBIRTH']
X['POLICY_AGE'] = X['CCR_CC_CLAIM_LOSSDATE'] - X['POLICY_SC_POLICYINCEPTIONDATE']
X['AGE'] = X['CCR_CC_CLAIM_LOSSDATE'] - X['CLAIMANT_DATEOFBIRTH']
## converting the timedeltas to numeric days
X['AGE_AT_INCEPT'] = X['AGE_AT_INCEPT'] / np.timedelta64(1, 'D')
X['POLICY_AGE'] = X['POLICY_AGE'] / np.timedelta64(1, 'D')
X['AGE'] = X['AGE'] / np.timedelta64(1, 'D')
# looking at the data
X.shape
X.dtypes
description = X.describe()
# unwanted columns
unwan_cols = ['CCR_CC_CLAIM_LOSSDATE', 'CLAIMANT_DATEOFBIRTH', 'POLICY_SC_POLICYINCEPTIONDATE', 'CUST_NUM']
cols = [col for col in X.columns if col not in unwan_cols]
X = X[cols]
y = X['CCR_CCTL_SC_FULFILMENTPATH_NAME']
# splitting
train, test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=700)
train.shape
test.shape
train_ID = train['CCR_CC_CLAIM_CLAIMNUMBER']
y_train = train['CCR_CCTL_SC_FULFILMENTPATH_NAME']
# dropping the ID and target columns from both model matrices
train.drop("CCR_CC_CLAIM_CLAIMNUMBER", axis=1, inplace=True)
train.drop("CCR_CCTL_SC_FULFILMENTPATH_NAME", axis=1, inplace=True)
test.drop("CCR_CC_CLAIM_CLAIMNUMBER", axis=1, inplace=True)
test.drop("CCR_CCTL_SC_FULFILMENTPATH_NAME", axis=1, inplace=True)
## categorical feature indices
cat_feature_indices = np.where(train.dtypes != float)[0]
# total missing values
missmap = train.isnull().sum().to_frame().sort_values(0, ascending=False)
missmap.head()
# a few columns carry roughly 30% missing values
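# An optional sketch (not part of the original flow): if those ~30%-missing
# columns should be dropped rather than kept, a threshold filter like the one
# below would do it. Left commented out so the pipeline is unchanged.
# high_missing = missmap[missmap[0] > 0.30 * len(train)].index
# train.drop(high_missing, axis=1, inplace=True)
# test.drop(high_missing, axis=1, inplace=True)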
# removing columns with a single unique value
cols_with_onlyone_val = train.columns[train.nunique() == 1]
train.drop(cols_with_onlyone_val.values, axis=1, inplace=True)
test.drop(cols_with_onlyone_val.values, axis=1, inplace=True)
## comparing the columns and dropping exact duplicates
NUM_OF_DECIMALS = 32
train = train.round(NUM_OF_DECIMALS)
test = test.round(NUM_OF_DECIMALS)
colsToRemove = []
columns = train.columns
for i in range(len(columns) - 1):
    v = train[columns[i]].values
    for j in range(i + 1, len(columns)):
        # a later column identical to an earlier one carries no extra information
        if np.array_equal(v, train[columns[j]].values):
            colsToRemove.append(columns[j])
colsToRemove = list(set(colsToRemove))
train.drop(colsToRemove, axis=1, inplace=True)
test.drop(colsToRemove, axis=1, inplace=True)
# Convert the train/test split to h2o frames and attach the response as a factor
train_h2o_df = h2o.H2OFrame(train)
train_h2o_df.set_names(list(train.columns))
train_h2o_df['response'] = h2o.H2OFrame(y_train.to_frame())
train_h2o_df['response'] = train_h2o_df['response'].asfactor()
test_h2o_df = h2o.H2OFrame(test)
test_h2o_df.set_names(list(test.columns))
test_h2o_df['response'] = h2o.H2OFrame(y_test.to_frame())
test_h2o_df['response'] = test_h2o_df['response'].asfactor()
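# A minimal sketch of actually fitting one of the imported h2o estimators on
# these frames (the script imports them but never trains one). The
# hyperparameters here are assumptions, and .auc() assumes a binary response.
rf_h2o = H2ORandomForestEstimator(ntrees=100, max_depth=10, seed=700)
rf_h2o.train(x=list(train.columns), y='response', training_frame=train_h2o_df)
print(rf_h2o.model_performance(test_h2o_df).auc())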
from sklearn import model_selection
from sklearn import ensemble
NUM_OF_FEATURES = 1000
def rmsle(y, pred):
    # as written this is plain RMSE (no log transform)
    return np.sqrt(np.mean(np.power(y - pred, 2)))
# importance-based feature selection on an internal validation split;
# assumes the remaining columns are numeric and non-missing
# (encode categoricals and impute first otherwise)
y_train_enc, y_classes = pd.factorize(y_train)
x1, x2, y1, y2 = train_test_split(train, y_train_enc, test_size=0.20, random_state=7)
model = ensemble.RandomForestRegressor(n_jobs=-1, random_state=7)
model.fit(x1, y1)
print(rmsle(y2, model.predict(x2)))
col = pd.DataFrame({'importance': model.feature_importances_, 'feature': train.columns}).sort_values(
    by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values
train = train[col]
test = test[col]
train.shape
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
# binary objective, so the classifier wrapper is the appropriate estimator
model_lgb = lgb.LGBMClassifier(objective='binary', num_leaves=144,
                               learning_rate=0.005, n_estimators=720, max_depth=13,
                               max_bin=55, bagging_fraction=0.8, verbose=-1,
                               bagging_freq=5, feature_fraction=0.9)
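# A minimal usage sketch for the wrapper above; the sklearn-style API accepts
# the string labels directly (assumes the feature columns are numeric by now).
model_lgb.fit(train, y_train)
print(model_lgb.score(test, y_test))  # mean accuracy on the held-out split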
param_grid = {
    'n_estimators': [50],
    'max_depth': range(4, 11),
    # the commented entries below are h2o GBM-style knobs kept for reference;
    # they are not valid GradientBoostingClassifier parameters
    # 'sample_rate': 1.0,              # row sampling rate per tree
    # 'col_sample_rate': 1.0,          # column sampling rate per split
    # 'col_sample_rate_per_tree': 1.0,
    # 'col_sample_rate_change_per_level': 1,
    # minimum number of samples required in a leaf node
    'min_samples_leaf': [1, 5, 10, 20, 50, 100],
    # 'nbins': [2**x for x in range(4, 11)[::2]],       # bins for continuous splits
    # 'nbins_cats': [2**x for x in range(4, 15)[::2]],  # bins for categorical splits
    # minimum impurity decrease required for a split to happen
    'min_impurity_decrease': [0.0],  # [0.0, 1e-8, 1e-6, 1e-4]
    'learning_rate': [0.15],
    # 'stopping_rounds': 5,
    # 'stopping_tolerance': 1e-4,
    # 'stopping_metric': 'misclassification',
    # 'balance_classes': [True, False]
}
# for now only using the lgb param dict
params = {
    'objective': 'binary',
    'learning_rate': 0.02,
    'num_leaves': 76,
    'feature_fraction': 0.64,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss'
}
# lgb.Dataset needs numeric labels, so reuse the encoding from above
y_test_enc = y_classes.get_indexer(y_test)
d_train = lgb.Dataset(train, y_train_enc)
d_test = lgb.Dataset(test, y_test_enc, reference=d_train)
# early stopping needs a validation set to monitor
bst = lgb.train(params, d_train, 5000, valid_sets=[d_test],
                verbose_eval=50, early_stopping_rounds=100)
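# A minimal scoring sketch for the boosted model (assumes the binary encoding
# above; best_iteration is set by early stopping).
from sklearn.metrics import roc_auc_score
test_pred_prob = bst.predict(test, num_iteration=bst.best_iteration)
print(roc_auc_score(y_test_enc, test_pred_prob))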
gbm = GradientBoostingClassifier()
grid = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5)
grid.fit(train, y_train)
best_model = grid.best_estimator_
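# Worth printing what the search settled on before scoring best_model;
# both attributes are standard on a fitted GridSearchCV.
print(grid.best_params_)
print(grid.best_score_)  # mean cross-validated score of the best candidate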
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
y_train_pred = best_model.predict(train)
y_train_pred_prob = best_model.predict_proba(train)[:, 1]
y_test_pred = best_model.predict(test)
y_test_pred_prob = best_model.predict_proba(test)[:, 1]
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(accuracy_score(y_test, y_test_pred))
# Plot ROC
# Training set
roc_auc = roc_auc_score(pd.get_dummies(y_train)['Do and Charge'], y_train_pred_prob)
fpr, tpr, th = roc_curve(pd.get_dummies(y_train)['Do and Charge'], y_train_pred_prob)
plt.figure()
plt.plot(fpr, tpr, label='AUC=%0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - Train Set')
plt.legend(loc='lower right')
plt.show()
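# The same ROC, sketched for the held-out test set; mirrors the block above
# and assumes 'Do and Charge' is the positive class here as well.
roc_auc_test = roc_auc_score(pd.get_dummies(y_test)['Do and Charge'], y_test_pred_prob)
fpr_t, tpr_t, th_t = roc_curve(pd.get_dummies(y_test)['Do and Charge'], y_test_pred_prob)
plt.figure()
plt.plot(fpr_t, tpr_t, label='AUC=%0.2f' % roc_auc_test)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - Test Set')
plt.legend(loc='lower right')
plt.show()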
# catboost classification
# cat_feature_indices was computed before columns were dropped above, so
# recompute it against the current frame to keep the indices valid
cat_feature_indices = np.where(train.dtypes != float)[0]
model = CatBoostClassifier(iterations=50, depth=3, learning_rate=0.1)
model.fit(train, y_train, cat_features=cat_feature_indices, plot=True)
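# A minimal evaluation sketch for the fitted CatBoost model on the held-out split;
# CatBoostClassifier.score reports accuracy.
print(model.score(test, y_test))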