Guest User

Untitled

a guest
Feb 16th, 2019
132
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.98 KB | None | 0 0
  1. # Import necessary libraries.
  2. import numpy as np
  3. import pandas as pd
  4. import matplotlib.pyplot as plt
  5.  
  6.  
  7. # Extracting data from .csv file.
  8. file = 'C:\Users\alhut\OneDrive\Desktop\credit card default project\creditcard_default.csv'
  9. dataset = pd.read_csv(file, index_col='ID')
  10.  
  11. dataset.rename(columns=lambda x: x.lower(), inplace=True)
  12.  
  13.  
  14. # Preparing the data using dummy features (one-hot encoding). Base values are: other_education, female, not_married.
  15. dataset['grad_school'] = (dataset['education'] == 1).astype('int')
  16. dataset['universty'] = (dataset['education'] == 2).astype('int')
  17. dataset['high_school'] = (dataset['education'] == 3).astype('int')
  18. dataset.drop('education', axis=1, inplace=True) # Drops the education column because all the information is available in the features above.
  19.  
  20. dataset['male'] = (dataset['sex'] == 1).astype('int')
  21. dataset.drop('sex', axis=1, inplace=True)
  22.  
  23. dataset['married'] = (dataset['marriage'] == 1).astype('int')
  24. dataset.drop('marriage', axis=1, inplace=True)
  25.  
  26. # In the case of pay features, <= 0 means the payment was not delayed.
  27. pay_features = ['pay_0','pay_2','pay_3','pay_4','pay_5','pay_6']
  28. for p in pay_features:
  29. dataset.loc[dataset[p]<=0, p] = 0
  30.  
  31. dataset.rename(columns={'default_payment_next_month':'default'}, inplace=True) # Renames last column for convenience.
  32.  
  33.  
  34. # Importing objects from sklearn to help with the predictions.
  35. from sklearn.model_selection import train_test_split
  36. from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, precision_recall_curve
  37. from sklearn.preprocessing import RobustScaler
  38.  
  39.  
  40. # Scaling and fitting the x and y variables and creating the x and y test and train variables.
  41. target_name = 'default'
  42. X = dataset.drop('default', axis=1)
  43. robust_scaler = RobustScaler()
  44. X = robust_scaler.fit_transform(X)
  45. y = dataset[target_name]
  46. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123, stratify=y)
  47.  
  48.  
  49. # Creating a confusion matrix.
  50. def CMatrix(CM, labels=['pay','default']):
  51. df = pd.DataFrame(data=CM, index=labels, columns=labels)
  52. df.index.name='TRUE'
  53. df.columns.name='PREDICTION'
  54. df.loc['TOTAL'] = df.sum()
  55. df['Total'] = df.sum(axis=1)
  56. return df
  57.  
  58.  
  59.  
  60. # Preparing a pandas DataFrame to analyze models (evaluation metrics).
  61. metrics = pd.DataFrame(index=['accuracy', 'precision', 'recall'],
  62. columns=['NULL','LogisticReg','ClassTree','NaiveBayes'])
  63.  
  64.  
  65. #######################
  66. # The Null Model.
  67. y_pred_test = np.repeat(y_train.value_counts().idxmax(), y_test.size)
  68. metrics.loc['accuracy','NULL'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
  69. metrics.loc['precision','NULL'] = precision_score(y_pred=y_pred_test, y_true=y_test)
  70. metrics.loc['recall','NULL'] = recall_score(y_pred=y_pred_test, y_true=y_test)
  71.  
  72. CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
  73. CMatrix(CM)
  74.  
  75.  
  76. # A. Logistic Regression.
  77. # 1- Import the estimator object (model).
  78. from sklearn.linear_model import LogisticRegression
  79.  
  80. # 2- Create an instance of the estimator.
  81. logistic_regression = LogisticRegression(n_jobs=-1, random_state=15)
  82.  
  83. # 3- Use the trainning data to train the estimator.
  84. logistic_regression.fit(X_train, y_train)
  85.  
  86. # 4- Evaluate the model.
  87. y_pred_test = logistic_regression.predict(X_test)
  88. metrics.loc['accuracy','LogisticReg'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
  89. metrics.loc['precision','LogisticReg'] = precision_score(y_pred=y_pred_test, y_true=y_test)
  90. metrics.loc['recall','LogisticReg'] = recall_score(y_pred=y_pred_test, y_true=y_test)
  91.  
  92. # Confusion Matrix.
  93. CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
  94. CMatrix(CM)
  95.  
  96.  
  97. # B. Classification Trees.
  98. # 1- Import the estimator object (model).
  99. from sklearn.tree import DecisionTreeClassifier
  100.  
  101. # 2- Create an instance of the estimator.
  102. class_tree = DecisionTreeClassifier(min_samples_split=30, min_samples_leaf=10, random_state=10)
  103.  
  104. # 3- Use the trainning data to train the estimator.
  105. class_tree.fit(X_train, y_train)
  106.  
  107. # 4- Evaluate the model.
  108. y_pred_test = class_tree.predict(X_test)
  109. metrics.loc['accuracy','ClassTree'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
  110. metrics.loc['precision','ClassTree'] = precision_score(y_pred=y_pred_test, y_true=y_test)
  111. metrics.loc['recall','ClassTree'] = recall_score(y_pred=y_pred_test, y_true=y_test)
  112.  
  113. # Confusion Matrix.
  114. CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
  115. CMatrix(CM)
  116.  
  117.  
  118. # C. Naive Bayes Classifier
  119. # 1- Import the estimator object (model).
  120. from sklearn.naive_bayes import GaussianNB
  121.  
  122. # 2- Create an instance of the estimator.
  123. NBC = GaussianNB()
  124.  
  125. # 3- Use the trainning data to train the estimator.
  126. NBC.fit(X_train, y_train)
  127.  
  128. # 4- Evaluate the model.
  129. y_pred_test = NBC.predict(X_test)
  130. metrics.loc['accuracy','NaiveBayes'] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
  131. metrics.loc['precision','NaiveBayes'] = precision_score(y_pred=y_pred_test, y_true=y_test)
  132. metrics.loc['recall','NaiveBayes'] = recall_score(y_pred=y_pred_test, y_true=y_test)
  133.  
  134. # Confusion Matrix.
  135. CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
  136. CMatrix(CM)
  137.  
  138.  
  139. #######################
  140. # Comparing the models with percentages.
  141. 100*metrics
  142.  
  143.  
  144. # Comparing the models with a bar graph.
  145. fig, ax = plt.subplots(figsize=(8,5))
  146. metrics.plot(kind='barh', ax=ax)
  147. ax.grid();
  148.  
  149.  
  150. # Adjusting the precision and recall values for the logistic regression model and the Naive Bayes Classifier model.
  151. precision_nb, recall_nb, thresholds_nb = precision_recall_curve(y_true=y_test, probas_pred=NBC.predict_proba(X_test)[:,1])
  152. precision_lr, recall_lr, thresholds_lr = precision_recall_curve(y_true=y_test, probas_pred=logistic_regression.predict_proba(X_test)[:,1])
  153.  
  154.  
  155. # Plotting the new values for the logistic regression model and the Naive Bayes Classifier model.
  156. fig, ax = plt.subplots(figsize=(8,5))
  157. ax.plot(precision_nb, recall_nb, label='NaiveBayes')
  158. ax.plot(precision_lr, recall_lr, label='LogisticReg')
  159. ax.set_xlabel('Precision')
  160. ax.set_ylabel('Recall')
  161. ax.set_title('Precision-Recall Curve')
  162. ax.hlines(y=0.5, xmin=0, xmax=1, color='r')
  163. ax.legend()
  164. ax.grid();
  165.  
  166.  
  167. # Creating a confusion matrix for modified Logistic Regression Classifier.
  168. fig, ax = plt.subplots(figsize=(8,5))
  169. ax.plot(thresholds_lr, precision_lr[1:], label='Precision')
  170. ax.plot(thresholds_lr, recall_lr[1:], label='Recall')
  171. ax.set_xlabel('Classification Threshold')
  172. ax.set_ylabel('Precision, Recall')
  173. ax.set_title('Logistic Regression Classifier: Precision-Recall')
  174. ax.hlines(y=0.6, xmin=0, xmax=1, color='r')
  175. ax.legend()
  176. ax.grid();
  177.  
  178.  
  179. # Adjusting the threshold to 0.2.
  180. y_pred_proba = logistic_regression.predict_proba(X_test)[:,1]
  181. y_pred_test = (y_pred_proba >= 0.2).astype('int')
  182.  
  183. # Confusion Matrix.
  184. CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
  185. print('Recall: ', str(100*recall_score(y_pred=y_pred_test, y_true=y_test)) + '%')
  186. print('Precision: ', str(100*precision_score(y_pred=y_pred_test, y_true=y_test)) + '%')
  187. CMatrix(CM)
  188.  
  189.  
  190. #######################
  191. # Defining a function to make individual predictions.
  192. def make_ind_prediction(new_data):
  193. data = new_data.values.reshape(1, -1)
  194. data = robust_scaler.transform(data)
  195. prob = logistic_regression.predict_proba(data)[0][1]
  196. if prob >= 0.2:
  197. return 'Will default.'
  198. else:
  199. return 'Will pay.'
  200.  
  201.  
  202. # Making individual predictions using given data.
  203. from collections import OrderedDict
  204. new_customer = OrderedDict([('limit_bal', 4000),('age', 50 ),('bill_amt1', 500),
  205. ('bill_amt2', 35509 ),('bill_amt3', 689 ),('bill_amt4', 0 ),
  206. ('bill_amt5', 0 ),('bill_amt6', 0 ), ('pay_amt1', 0 ),('pay_amt2', 35509 ),
  207. ('pay_amt3', 0 ),('pay_amt4', 0 ),('pay_amt5', 0 ), ('pay_amt6', 0 ),
  208. ('male', 1 ),('grad_school', 0 ),('university', 1 ), ('high_school', 0 ),
  209. ('married', 1 ),('pay_0', -1 ),('pay_2', -1 ),('pay_3', -1 ),
  210. ('pay_4', 0),('pay_5', -1), ('pay_6', 0)])
  211.  
  212. new_customer = pd.Series(new_customer)
  213. make_ind_prediction(new_customer)
  214.  
  215. # All your other imports...
  216. from sklearn.naive_bayes import GaussianNB
  217.  
  218. # ...
  219.  
  220. def run_classifier(classifier_type, classifier_kwargs, X_train, y_train, X_test, y_test, metrics):
  221. # 1- Import the estimator object (model).
  222. # 2- Create an instance of the estimator.
  223. classifier = classifier_type(**classifier_kwargs)
  224.  
  225. # 3- Use the trainning data to train the estimator.
  226. classifier.fit(X_train, y_train)
  227.  
  228. # 4- Evaluate the model.
  229. y_pred_test = classifier.predict(X_test)
  230. name = classifier_type.__name__
  231. metrics.loc['accuracy', name] = accuracy_score(y_pred=y_pred_test, y_true=y_test)
  232. metrics.loc['precision', name] = precision_score(y_pred=y_pred_test, y_true=y_test)
  233. metrics.loc['recall', name] = recall_score(y_pred=y_pred_test, y_true=y_test)
  234.  
  235. # Confusion Matrix.
  236. CM = confusion_matrix(y_pred=y_pred_test, y_true=y_test)
  237. CMatrix(CM)
  238.  
  239. return classifier, CM
  240.  
  241. # ...
  242.  
  243. def main():
  244. # ...
  245. naive_bayes, nb_cm = run_classifier(NaiveBayes, {}, X_train, y_train, X_test, y_test)
  246. # etc.
  247.  
  248. plot_pr_curve(naive_bayes, X_test, Y_test)
  249. # etc.
  250.  
  251. if __name__ == '__main__':
  252. main()
Add Comment
Please, Sign In to add comment