# View the structure of loan_data
str(loan_data)
# Load the gmodels package
library(gmodels)
# Call CrossTable() on loan_status
CrossTable(loan_data$loan_status)
# Call CrossTable() on grade and loan_status
CrossTable(loan_data$grade, loan_data$loan_status, prop.r = TRUE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE)
CrossTable(loan_data$loan_status, loan_data$grade, prop.r = TRUE, prop.c = FALSE, prop.t = FALSE, prop.chisq = FALSE)

# Create histogram of loan_amnt: hist_1
hist_1 <- hist(loan_data$loan_amnt)
# Print locations of the breaks in hist_1
hist_1$breaks
# Change number of breaks and add labels: hist_2
hist_2 <- hist(loan_data$loan_amnt, breaks = 200, xlab = "Loan amount", main = "Histogram of the loan amount")

# Plot the age variable
plot(loan_data$age, ylab = "Age")

# Save the outlier's index to index_highage
index_highage <- which(loan_data$age > 122)

# Create data set new_data with the outlier deleted
new_data <- loan_data[-index_highage, ]

# Make a bivariate scatterplot of age and annual income
plot(loan_data$age, loan_data$annual_inc, xlab = "Age", ylab = "Annual Income")

# Look at a summary of int_rate
summary(loan_data$int_rate)

# Get indices of missing interest rates: na_index
na_index <- which(is.na(loan_data$int_rate))
# Remove observations with missing interest rates: loan_data_delrow_na
loan_data_delrow_na <- loan_data[-na_index, ]
# Make a copy of loan_data
loan_data_delcol_na <- loan_data
# Delete the interest rate column from loan_data_delcol_na
loan_data_delcol_na$int_rate <- NULL

# Compute the median of int_rate
median_ir <- median(loan_data$int_rate, na.rm = TRUE)
# Make a copy of loan_data
loan_data_replace <- loan_data
# Replace missing interest rates with the median
loan_data_replace$int_rate[is.na(loan_data_replace$int_rate)] <- median_ir
# Check that the NAs are gone
summary(loan_data_replace$int_rate)

# Coarse classification: bin int_rate into categories, keeping a separate "Missing" bin
loan_data$ir_cat <- rep(NA, length(loan_data$int_rate))

loan_data$ir_cat[which(loan_data$int_rate <= 8)] <- "0-8"
loan_data$ir_cat[which(loan_data$int_rate > 8 & loan_data$int_rate <= 11)] <- "8-11"
loan_data$ir_cat[which(loan_data$int_rate > 11 & loan_data$int_rate <= 13.5)] <- "11-13.5"
loan_data$ir_cat[which(loan_data$int_rate > 13.5)] <- "13.5+"
loan_data$ir_cat[which(is.na(loan_data$int_rate))] <- "Missing"

loan_data$ir_cat <- as.factor(loan_data$ir_cat)

# Look at the new variable using plot()
plot(loan_data$ir_cat)
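# Note: emp_cat is used as a predictor further below but is not created in this
# script. It presumably comes from coarse-classifying an employment-length
# variable in the same way as ir_cat. A minimal sketch, assuming the raw column
# is called emp_length and using illustrative bin edges:
loan_data$emp_cat <- rep(NA, length(loan_data$emp_length))
loan_data$emp_cat[which(loan_data$emp_length <= 15)] <- "0-15"
loan_data$emp_cat[which(loan_data$emp_length > 15 & loan_data$emp_length <= 30)] <- "15-30"
loan_data$emp_cat[which(loan_data$emp_length > 30 & loan_data$emp_length <= 45)] <- "30-45"
loan_data$emp_cat[which(loan_data$emp_length > 45)] <- "45+"
loan_data$emp_cat[which(is.na(loan_data$emp_length))] <- "Missing"
loan_data$emp_cat <- as.factor(loan_data$emp_cat)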
# Set seed of 567
set.seed(567)
# Store row numbers for the training set: index_train
index_train <- sample(1:nrow(loan_data), round(2/3 * nrow(loan_data)))
# Create training set: training_set
training_set <- loan_data[index_train, ]
# Create test set: test_set
test_set <- loan_data[-index_train, ]

# Create confusion matrix
conf_matrix <- table(test_set$loan_status, model_pred)
# Compute classification accuracy
(6092 + 349) / nrow(test_set)
# Compute sensitivity
349 / 1037

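# model_pred above is assumed to be a vector of binary (0/1) predictions for
# test_set produced earlier in the workspace; it is not created in this script.
# Rather than hard-coding the cell counts, the same accuracy and sensitivity
# can be read off conf_matrix directly. A minimal sketch, assuming both
# loan_status and model_pred are coded 0/1:
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
sensitivity <- conf_matrix["1", "1"] / sum(conf_matrix["1", ])
accuracy
sensitivity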
# Build a glm model with variable ir_cat as a predictor
log_model_cat <- glm(loan_status ~ ir_cat, family = "binomial", data = training_set)
# Print the parameter estimates
log_model_cat
# Look at the different categories in ir_cat using table()
table(training_set$ir_cat)
table(loan_data$ir_cat)

# Build the logistic regression model
log_model_multi <- glm(loan_status ~ age + ir_cat + grade + loan_amnt + annual_inc, family = "binomial", data = training_set)
# Obtain significance levels using summary()
summary(log_model_multi)

# Make PD-predictions for all test set elements using log_model_small (see the sketch below)
predictions_all_small <- predict(log_model_small, newdata = test_set, type = "response")
# Look at the range of the object "predictions_all_small"
predictions_all_small
range(predictions_all_small)

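# log_model_small is assumed to have been fitted earlier in the workspace; it
# is not defined in this script. A minimal sketch of a plausible "small" model
# using two of the predictors seen above (the exact formula is an assumption):
log_model_small <- glm(loan_status ~ age + ir_cat, family = "binomial", data = training_set)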
# Construct a logistic regression model using all available predictors in the data set
log_model_full <- glm(loan_status ~ ., family = "binomial", data = training_set)
# Make PD-predictions for all test set elements using the full logistic regression model
predictions_all_full <- predict(log_model_full, newdata = test_set, type = "response")
# Look at the predictions range
range(predictions_all_full)

# The code for the logistic regression model and the predictions is given below
log_model_full <- glm(loan_status ~ ., family = "binomial", data = training_set)
predictions_all_full <- predict(log_model_full, newdata = test_set, type = "response")
# Make a binary predictions vector using a cut-off of 15%
pred_cutoff_15 <- ifelse(predictions_all_full > 0.15, 1, 0)
# Construct a confusion matrix
table(test_set$loan_status, pred_cutoff_15)
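# As a follow-up, the same confusion matrix can be stored and summarised
# numerically. A short sketch; tab_cutoff_15 is an illustrative name and
# loan_status is assumed to be coded 0/1:
tab_cutoff_15 <- table(test_set$loan_status, pred_cutoff_15)
# Classification accuracy at the 15% cut-off
sum(diag(tab_cutoff_15)) / nrow(test_set)
# Sensitivity: share of actual defaults predicted as defaults
tab_cutoff_15["1", "1"] / sum(tab_cutoff_15["1", ])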

# Fit the logit, probit and cloglog-link logistic regression models
log_model_logit <- glm(loan_status ~ age + emp_cat + ir_cat + loan_amnt,
                       family = binomial(link = logit), data = training_set)
log_model_probit <- glm(loan_status ~ age + emp_cat + ir_cat + loan_amnt,
                        family = binomial(link = probit), data = training_set)
log_model_cloglog <- glm(loan_status ~ age + emp_cat + ir_cat + loan_amnt,
                         family = binomial(link = cloglog), data = training_set)

# Make predictions for all models using the test set
predictions_logit <- predict(log_model_logit, newdata = test_set, type = "response")
predictions_probit <- predict(log_model_probit, newdata = test_set, type = "response")
predictions_cloglog <- predict(log_model_cloglog, newdata = test_set, type = "response")

# Use a cut-off of 14% to make binary prediction vectors
cutoff <- 0.14
class_pred_logit <- ifelse(predictions_logit > cutoff, 1, 0)
class_pred_probit <- ifelse(predictions_probit > cutoff, 1, 0)
class_pred_cloglog <- ifelse(predictions_cloglog > cutoff, 1, 0)

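# true_val is assumed to hold the observed loan statuses for the test set;
# it is not defined in this script. A one-line sketch of the likely definition:
true_val <- test_set$loan_status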
# Make a confusion matrix for each of the three models
tab_class_logit <- table(true_val, class_pred_logit)
tab_class_probit <- table(true_val, class_pred_probit)
tab_class_cloglog <- table(true_val, class_pred_cloglog)

# Compute the classification accuracy for all three models
acc_logit <- sum(diag(tab_class_logit)) / nrow(test_set)
acc_probit <- sum(diag(tab_class_probit)) / nrow(test_set)
acc_cloglog <- sum(diag(tab_class_cloglog)) / nrow(test_set)

# The Gini measure of the root node is given below
gini_root <- 2 * 89 / 500 * 411 / 500
# Compute the Gini measure for the left leaf node
gini_ll <- 2 * 45 / 446 * 401 / 446
# Compute the Gini measure for the right leaf node
gini_rl <- 2 * 10 / 54 * 44 / 54
# Compute the gain
gain <- gini_root - 446 / 500 * gini_ll - 54 / 500 * gini_rl
gain
# Compare the improve column in small_tree$splits with our computed gain
# multiplied by 500; the two values should be the same
small_tree$splits
improve <- gain * 500
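# The arithmetic above generalizes: for a node with n cases of which n_def are
# defaults, Gini = 2 * (n_def / n) * (1 - n_def / n), and the gain of a split
# is the parent Gini minus the size-weighted child Ginis. A small illustrative
# helper (the function name gini_gain is not from the original exercise):
gini_gain <- function(def_root, n_root, def_left, n_left, def_right, n_right) {
  gini <- function(def, n) 2 * (def / n) * ((n - def) / n)
  gini(def_root, n_root) -
    (n_left / n_root) * gini(def_left, n_left) -
    (n_right / n_root) * gini(def_right, n_right)
}
# Reproduces the gain computed above:
gini_gain(89, 500, 45, 446, 10, 54)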

# Load the rpart package
library(rpart)
# Construct a decision tree on the undersampled training set (a sketch of how
# such a set might be built follows after this block), relaxing the complexity
# parameter to 0.001 via rpart.control
tree_undersample <- rpart(loan_status ~ ., method = "class",
                          data = undersampled_training_set,
                          control = rpart.control(cp = 0.001))
tree_undersample
# Plot the decision tree
plot(tree_undersample, uniform = TRUE)
# Add labels to the decision tree
text(tree_undersample)
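# undersampled_training_set is assumed to exist in the workspace; it is not
# built in this script. A minimal sketch of one way to construct it, keeping
# every default and twice as many non-defaults (the 2:1 ratio and the seed are
# purely illustrative; loan_status is assumed to be coded 0/1):
defaults <- training_set[training_set$loan_status == 1, ]
non_defaults <- training_set[training_set$loan_status == 0, ]
set.seed(123)  # illustrative seed
keep_non_defaults <- non_defaults[sample(nrow(non_defaults), 2 * nrow(defaults)), ]
undersampled_training_set <- rbind(defaults, keep_non_defaults)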

# Construct a tree with adjusted prior probabilities
tree_prior <- rpart(loan_status ~ ., method = "class", data = training_set,
                    control = rpart.control(cp = 0.001),
                    parms = list(prior = c(0.7, 0.3)))
# Plot the decision tree
plot(tree_prior, uniform = TRUE)
# Add labels to the decision tree
text(tree_prior)

# Construct a decision tree using a loss matrix that penalizes misclassified
# defaults 10 times more heavily
tree_loss_matrix <- rpart(loan_status ~ ., method = "class", data = training_set,
                          parms = list(loss = matrix(c(0, 10, 1, 0), ncol = 2)),
                          control = rpart.control(cp = 0.001))
# Plot the decision tree
plot(tree_loss_matrix, uniform = TRUE)
# Add labels to the decision tree
text(tree_loss_matrix)

# Plot the cross-validated error rate as a function of the complexity parameter
plotcp(tree_prior)
# Use printcp() to identify the complexity parameter that minimizes the
# cross-validated error rate
printcp(tree_prior)
# Create an index for the row with the minimum xerror
index <- which.min(tree_prior$cptable[, "xerror"])
# Create tree_min
tree_min <- tree_prior$cptable[index, "CP"]
# Prune the tree using tree_min
ptree_prior <- prune(tree_prior, cp = tree_min)
# Load rpart.plot, which provides prp() and rpart.plot()
library(rpart.plot)
# Use prp() to plot the pruned tree
prp(ptree_prior)

# Set a seed and construct the tree with the loss matrix again
set.seed(345)
tree_loss_matrix <- rpart(loan_status ~ ., method = "class", data = training_set,
                          parms = list(loss = matrix(c(0, 10, 1, 0), ncol = 2)),
                          control = rpart.control(cp = 0.001))
# Examine the cross-validated error rate for each complexity parameter
printcp(tree_loss_matrix)
cps <- printcp(tree_loss_matrix)[, 1]
cps
rpart.plot(tree_loss_matrix)
# Plot the cross-validated error rate, then prune the tree using cp = 0.0012788
plotcp(tree_loss_matrix)
ptree_loss_matrix <- prune(tree_loss_matrix, cp = 0.0012788)
# Use prp() with argument extra = 1 to plot the pruned tree
prp(ptree_loss_matrix)
rpart.plot(ptree_loss_matrix)
prp(ptree_loss_matrix, extra = 1)

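# case_weights is assumed to exist in the workspace; it is not built in this
# script. A minimal sketch of one way to define it, giving defaults a larger
# weight than non-defaults (the weight of 3 is purely illustrative and
# loan_status is assumed to be coded 0/1):
case_weights <- ifelse(training_set$loan_status == 1, 3, 1)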
# Set a seed and construct a tree using weights, minsplit and minbucket
set.seed(345)
tree_weights <- rpart(loan_status ~ ., method = "class", data = training_set,
                      control = rpart.control(minsplit = 5, minbucket = 2, cp = 0.001),
                      weights = case_weights)
# Plot the cross-validated error rate for a changing cp
plotcp(tree_weights)
# Create an index for the row with the minimum xerror
index <- which.min(tree_weights$cptable[, "xerror"])
# Create tree_min
tree_min <- tree_weights$cptable[index, "CP"]
# Prune the tree using tree_min
ptree_weights <- prune(tree_weights, cp = tree_min)
# Plot the pruned tree using the rpart.plot package
rpart.plot(ptree_weights)
prp(ptree_weights, extra = 1)

# Make predictions for each of the pruned trees using the test set
pred_undersample <- predict(ptree_undersample, newdata = test_set, type = "class")
pred_prior <- predict(ptree_prior, newdata = test_set, type = "class")
pred_loss_matrix <- predict(ptree_loss_matrix, newdata = test_set, type = "class")
pred_weights <- predict(ptree_weights, newdata = test_set, type = "class")

# Construct confusion matrices using the predictions
confmat_undersample <- table(test_set$loan_status, pred_undersample)
confmat_prior <- table(test_set$loan_status, pred_prior)
confmat_loss_matrix <- table(test_set$loan_status, pred_loss_matrix)
confmat_weights <- table(test_set$loan_status, pred_weights)

# Compute the accuracies
acc_undersample <- sum(diag(confmat_undersample)) / nrow(test_set)
acc_prior <- sum(diag(confmat_prior)) / nrow(test_set)
acc_loss_matrix <- sum(diag(confmat_loss_matrix)) / nrow(test_set)
acc_weights <- sum(diag(confmat_weights)) / nrow(test_set)

# Make predictions for the probability of default using the pruned tree and the test set
prob_default_prior <- predict(ptree_prior, newdata = test_set)[, 2]

# Obtain the cutoff for an acceptance rate of 80%
cutoff_prior <- quantile(prob_default_prior, 0.8)

# Obtain the binary predictions
bin_pred_prior_80 <- ifelse(prob_default_prior > cutoff_prior, 1, 0)

# Obtain the actual default status for the accepted loans
accepted_status_prior_80 <- test_set$loan_status[bin_pred_prior_80 == 0]

# Obtain the bad rate for the accepted loans
sum(accepted_status_prior_80) / length(accepted_status_prior_80)


# Have a look at the function strategy_bank
strategy_bank
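# strategy_bank is provided in the workspace and is not defined in this script.
# Judging from how its result is used below ($table, $accept_rate, $bad_rate),
# a plausible sketch could look like the following; the grid of acceptance
# rates and the exact output format are assumptions, and test_set$loan_status
# is assumed to be coded 0/1:
strategy_bank <- function(prob_of_def) {
  accept_rate <- seq(1, 0, by = -0.05)
  cutoff <- rep(NA, length(accept_rate))
  bad_rate <- rep(NA, length(accept_rate))
  for (i in seq_along(accept_rate)) {
    # Cut-off such that the given share of loans (lowest PDs) is accepted
    cutoff[i] <- quantile(prob_of_def, accept_rate[i])
    accepted <- test_set$loan_status[prob_of_def <= cutoff[i]]
    # Bad rate = share of defaults among the accepted loans
    bad_rate[i] <- sum(accepted) / length(accepted)
  }
  table <- cbind(accept_rate, cutoff = round(cutoff, 4), bad_rate = round(bad_rate, 4))
  list(table = table, accept_rate = accept_rate, bad_rate = bad_rate, cutoff = cutoff)
}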

# Apply the function strategy_bank to both predictions_cloglog and predictions_loss_matrix
strategy_cloglog <- strategy_bank(predictions_cloglog)
strategy_loss_matrix <- strategy_bank(predictions_loss_matrix)

# Obtain the strategy tables for both prediction vectors
strategy_cloglog$table
strategy_loss_matrix$table

# Draw the strategy functions
par(mfrow = c(1, 2))
plot(strategy_cloglog$accept_rate, strategy_cloglog$bad_rate,
     type = "l", xlab = "Acceptance rate", ylab = "Bad rate",
     lwd = 2, main = "logistic regression")

plot(strategy_loss_matrix$accept_rate, strategy_loss_matrix$bad_rate,
     type = "l", xlab = "Acceptance rate",
     ylab = "Bad rate", lwd = 2, main = "tree")
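# The 1 x 2 plotting layout set above persists for later plots; assuming the
# ROC curves below should use the full plotting window, reset it first:
par(mfrow = c(1, 1))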

# Load the pROC package
library(pROC)
# Construct the objects containing ROC information
ROC_logit <- roc(test_set$loan_status, predictions_logit)
ROC_probit <- roc(test_set$loan_status, predictions_probit)
ROC_cloglog <- roc(test_set$loan_status, predictions_cloglog)
ROC_all_full <- roc(test_set$loan_status, predictions_all_full)
# Draw all ROC curves on one plot
plot(ROC_logit)
lines(ROC_probit, col = "blue")
lines(ROC_cloglog, col = "red")
lines(ROC_all_full, col = "green")
# Compute the AUCs
auc(ROC_logit)
auc(ROC_probit)
auc(ROC_cloglog)
auc(ROC_all_full)

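# predictions_undersample, predictions_prior, predictions_loss_matrix and
# predictions_weights (also used by strategy_bank above) are assumed to be
# probability-of-default predictions from the pruned trees; they are not
# created in this script. A minimal sketch of how they could be obtained,
# taking the second column (probability of default) of predict():
predictions_undersample <- predict(ptree_undersample, newdata = test_set)[, 2]
predictions_prior <- predict(ptree_prior, newdata = test_set)[, 2]
predictions_loss_matrix <- predict(ptree_loss_matrix, newdata = test_set)[, 2]
predictions_weights <- predict(ptree_weights, newdata = test_set)[, 2]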
# Construct the objects containing ROC information
ROC_undersample <- roc(test_set$loan_status, predictions_undersample)
ROC_prior <- roc(test_set$loan_status, predictions_prior)
ROC_loss_matrix <- roc(test_set$loan_status, predictions_loss_matrix)
ROC_weights <- roc(test_set$loan_status, predictions_weights)
# Draw the ROC curves in one plot
plot(ROC_undersample)
lines(ROC_prior, col = "blue")
lines(ROC_loss_matrix, col = "red")
lines(ROC_weights, col = "green")
# Compute the AUCs
auc(ROC_undersample)
auc(ROC_prior)
auc(ROC_loss_matrix)
auc(ROC_weights)


# Build four models, each time deleting one variable from log_3_remove_ir
log_4_remove_amnt <- glm(loan_status ~ grade + annual_inc + emp_cat,
                         family = binomial, data = training_set)
log_4_remove_grade <- glm(loan_status ~ loan_amnt + annual_inc + emp_cat,
                          family = binomial, data = training_set)
log_4_remove_inc <- glm(loan_status ~ loan_amnt + grade + emp_cat,
                        family = binomial, data = training_set)
log_4_remove_emp <- glm(loan_status ~ loan_amnt + grade + annual_inc,
                        family = binomial, data = training_set)

# Make PD-predictions for each of the models
pred_4_remove_amnt <- predict(log_4_remove_amnt, newdata = test_set, type = "response")
pred_4_remove_grade <- predict(log_4_remove_grade, newdata = test_set, type = "response")
pred_4_remove_inc <- predict(log_4_remove_inc, newdata = test_set, type = "response")
pred_4_remove_emp <- predict(log_4_remove_emp, newdata = test_set, type = "response")

# Compute the AUCs
auc(test_set$loan_status, pred_4_remove_amnt)
auc(test_set$loan_status, pred_4_remove_grade)
auc(test_set$loan_status, pred_4_remove_inc)
auc(test_set$loan_status, pred_4_remove_emp)

# Build three models, each time deleting one variable from log_4_remove_amnt
log_5_remove_grade <- glm(loan_status ~ annual_inc + emp_cat, family = binomial, data = training_set)
log_5_remove_inc <- glm(loan_status ~ grade + emp_cat, family = binomial, data = training_set)
log_5_remove_emp <- glm(loan_status ~ grade + annual_inc, family = binomial, data = training_set)

# Make PD-predictions for each of the models
pred_5_remove_grade <- predict(log_5_remove_grade, newdata = test_set, type = "response")
pred_5_remove_inc <- predict(log_5_remove_inc, newdata = test_set, type = "response")
pred_5_remove_emp <- predict(log_5_remove_emp, newdata = test_set, type = "response")

# Compute the AUCs
auc(test_set$loan_status, pred_5_remove_grade)
auc(test_set$loan_status, pred_5_remove_inc)
auc(test_set$loan_status, pred_5_remove_emp)

# Plot the ROC curve for the best model
plot(roc(test_set$loan_status, pred_4_remove_amnt))