Untitled

# View the structure of loan_data
str(loan_data)
# Load the gmodels package
library(gmodels)
# Call CrossTable() on loan_status
CrossTable(loan_data$loan_status)
# Call CrossTable() on grade and loan_status
CrossTable(loan_data$grade,loan_data$loan_status,prop.r=T,prop.c=F,prop.t=F,prop.chisq=F)
CrossTable(loan_data$loan_status,loan_data$grade,prop.r=T,prop.c=F,prop.t=F,prop.chisq=F)

# Create histogram of loan_amnt: hist_1
hist_1<-hist(loan_data$loan_amnt)
# Print locations of the breaks in hist_1
hist_1$breaks
# Change number of breaks and add labels: hist_2
hist_2 <- hist(loan_data$loan_amnt, breaks =200, xlab = "Loan amount",main = "Histogram of the loan amount")

# Plot the age variable
plot(loan_data$age, ylab = "Age")

# Save the outlier's index to index_highage
index_highage <- which(loan_data$age > 122)

# Create data set new_data with outlier deleted
new_data <- loan_data[-index_highage, ]

# Make bivariate scatterplot of age and annual income
plot(loan_data$age, loan_data$annual_inc, xlab = "Age", ylab = "Annual Income")

# Look at summary of loan_data
summary(loan_data$int_rate)

# Get indices of missing interest rates: na_index
na_index <- which(is.na(loan_data$int_rate))
# Remove observations with missing interest rates: loan_data_delrow_na
loan_data_delrow_na <- loan_data[-na_index, ]
# Make copy of loan_data
loan_data_delcol_na <- loan_data
# Delete interest rate column from loan_data_delcol_na
loan_data_delcol_na$int_rate <- NULL

# Compute the median of int_rate
median_ir<-median(loan_data$int_rate,na.rm=T)
# Make copy of loan_data
loan_data_replace <- loan_data
# Replace missing interest rates with median
loan_data_replace$int_rate[is.na(loan_data_replace$int_rate)] <-median_ir
# Check if the NAs are gone
summary(loan_data_replace$int_rate)

# Make the necessary replacements in the coarse classification example below
loan_data$ir_cat <- rep(NA, length(loan_data$int_rate))

loan_data$ir_cat[which(loan_data$int_rate <= 8)] <- "0-8"
loan_data$ir_cat[which(loan_data$int_rate > 8 & loan_data$int_rate <= 11)] <- "8-11"
loan_data$ir_cat[which(loan_data$int_rate > 11 & loan_data$int_rate <= 13.5)] <- "11-13.5"
loan_data$ir_cat[which(loan_data$int_rate > 13.5)] <- "13.5+"
loan_data$ir_cat[which(is.na(loan_data$int_rate))] <- "Missing"

loan_data$ir_cat <- as.factor(loan_data$ir_cat)

# Look at your new variable using plot()
plot(loan_data$ir_cat)

# Set seed of 567
set.seed(567)
# Store row numbers for training set: index_train
index_train<-sample(1:nrow(loan_data),round(2/3*nrow(loan_data)))
# Create training set: training_set
training_set <- loan_data[index_train,]
# Create test set: test_set
test_set<-loan_data[-index_train,]

# Create confusion matrix
conf_matrix <- table(test_set$loan_status, model_pred)
# Compute classification accuracy
(6092 + 349) / nrow(test_set)
# Compute sensitivity
349 / 1037

# Build a glm model with variable ir_cat as a predictor
log_model_cat<-glm(loan_status~ir_cat,family="binomial",data=training_set)
# Print the parameter estimates
log_model_cat
# Look at the different categories in ir_cat using table()
table(training_set$ir_cat)
table(loan_data$ir_cat)

# Build the logistic regression model
log_model_multi<-glm(loan_status~age+ir_cat+grade+loan_amnt+annual_inc,family="binomial",data=training_set)
# Obtain significance levels using summary()
summary(log_model_multi)

# Build the logistic regression model
predictions_all_small <- predict(log_model_small, newdata = test_set, type = "response")
# Look at the range of the object "predictions_all_small"
predictions_all_small
range(predictions_all_small)

# Change the code below to construct a logistic regression model using all available predictors in the data set
log_model_full<-glm(loan_status~.,family="binomial",data=training_set)
# Make PD-predictions for all test set elements using the the full logistic regression model
predictions_all_full<-predict(log_model_full,newdata=test_set,type="response")
# Look at the predictions range
range(predictions_all_full)

# The code for the logistic regression model and the predictions is given below
log_model_full <- glm(loan_status ~ ., family = "binomial", data = training_set)
predictions_all_full <- predict(log_model_full, newdata = test_set, type = "response")
# Make a binary predictions-vector using a cut-off of 15%
pred_cutoff_15<-ifelse(predictions_all_full>0.15,1,0)
# Construct a confusion matrix
table(test_set$loan_status,pred_cutoff_15)

# Fit the logit, probit and cloglog-link logistic regression models
log_model_logit <- glm(loan_status ~ age + emp_cat + ir_cat + loan_amnt,
                       family = binomial(link = logit), data = training_set)
log_model_probit <- glm(loan_status ~ age + emp_cat + ir_cat + loan_amnt,
                        family =  binomial(link = probit), data = training_set)
log_model_cloglog <- glm(loan_status ~ age + emp_cat + ir_cat + loan_amnt,
                         family = binomial(link = cloglog), data = training_set)

# Make predictions for all models using the test set
predictions_logit <- predict(log_model_logit, newdata = test_set, type = "response")
predictions_probit <- predict(log_model_probit, newdata = test_set, type = "response")
predictions_cloglog <- predict(log_model_cloglog, newdata = test_set, type = "response")

# Use a cut-off of 14% to make binary predictions-vectors
cutoff <- 0.14
class_pred_logit <- ifelse(predictions_logit > cutoff, 1, 0)
class_pred_probit <- ifelse(predictions_probit > cutoff, 1, 0)
class_pred_cloglog <- ifelse(predictions_cloglog  > cutoff, 1, 0)

# Make a confusion matrix for the three models
tab_class_logit <- table(true_val, class_pred_logit)
tab_class_probit <- table(true_val, class_pred_probit)
tab_class_cloglog <- table(true_val, class_pred_cloglog)

# Compute the classification accuracy for all three models
acc_logit <- sum(diag(tab_class_logit)) / nrow(test_set)
acc_probit <- sum(diag(tab_class_probit)) / nrow(test_set)
acc_cloglog <- sum(diag(tab_class_cloglog)) / nrow(test_set)

# The Gini-measure of the root node is given below
gini_root <- 2 * 89 / 500 * 411 / 500
# Compute the Gini measure for the left leaf node
gini_ll <- 2 * 45 / 446 * 401 / 446
# Compute the Gini measure for the right leaf node
gini_rl <- 2 * 10 / 54 * 44 / 54
# Compute the gain
gain <- gini_root - 446 / 500 * gini_ll - 54 / 500 * gini_rl
gain
# compare the gain-column in small_tree$splits with our computed gain, multiplied by 500, and assure they are the same
small_tree$splits
improve <- gain * 500

# Load package rpart in your workspace.
library(rpart)
# Change the code provided in the video such that a decision tree is constructed using the undersampled training set. Include rpart.control to relax the complexity parameter to 0.001.
tree_undersample <- rpart(loan_status ~ ., method = "class",data =  undersampled_training_set,control = rpart.control(cp = 0.001))
tree_undersample
# Plot the decision tree
plot(tree_undersample, uniform = TRUE)
# Add labels to the decision tree
text(tree_undersample)

# Change the code below such that a tree is constructed with adjusted prior probabilities.
tree_prior <- rpart(loan_status ~ ., method = "class",data = training_set,control=rpart.control(cp=0.001),parms=list(prior=c(0.7,0.3)))
# Plot the decision tree
plot(tree_prior,uniform=T)
# Add labels to the decision tree
text(tree_prior)

# Change the code below such that a decision tree is constructed using a loss matrix penalizing 10 times more heavily for misclassified defaults.
tree_loss_matrix <- rpart(loan_status ~ ., method = "class",data =  training_set,parms=list(loss=matrix(c(0,10,1,0),ncol=2)),control=rpart.control(cp=0.001))
# Plot the decision tree
plot(tree_loss_matrix,uniform=T)
# Add labels to the decision tree
text(tree_loss_matrix)

# Plot the cross-validated error rate as a function of the complexity parameter
plotcp(tree_prior)
# Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized
printcp(tree_prior)
# Create an index for of the row with the minimum xerror
index <- which.min(tree_prior$cptable[, "xerror"])
# Create tree_min
tree_min <- tree_prior$cptable[index, "CP"]
#  Prune the tree using tree_min
ptree_prior <- prune(tree_prior, cp = tree_min)
# Use prp() to plot the pruned tree
prp(ptree_prior)

# set a seed and run the code to construct the tree with the loss matrix again
set.seed(345)
tree_loss_matrix  <- rpart(loan_status ~ ., method = "class", data = training_set,parms = list(loss=matrix(c(0, 10, 1, 0), ncol = 2)),control = rpart.control(cp = 0.001))
# Plot the cross-validated error rate as a function of the complexity parameter
printcp(tree_loss_matrix)
cps<-printcp(tree_loss_matrix)[,1]
cps
rpart.plot(tree_loss_matrix)
# Prune the tree using cp = 0.0012788
plotcp(tree_loss_matrix)
ptree_loss_matrix<-prune(tree_loss_matrix,cp=0.0012788)
# Use prp() and argument extra = 1 to plot the pruned tree
prp(ptree_loss_matrix)
rpart.plot(ptree_loss_matrix)
prp(tree_loss_matrix,extra=1)

# set a seed and run the code to obtain a tree using weights, minsplit and minbucket
set.seed(345)
tree_weights <- rpart(loan_status ~ ., method = "class",data = training_set,control = rpart.control(minsplit =5, minbucket =2, cp = 0.001),weight=case_weights)
# Plot the cross-validated error rate for a changing cp
plotcp(tree_weights)
# Create an index for of the row with the minimum xerror
index <- which.min(tree_weights$cptable[ , "xerror"])
# Create tree_min
tree_min <- tree_weights$cp[index, "CP"]
# Prune the tree using tree_min
ptree_weights<-prune(tree_weights,c=tree_min)
# Plot the pruned tree using the rpart.plot()-package
rpart.plot(ptree_weights)
prp(ptree_weights,extra=1)

# Make predictions for each of the pruned trees using the test set.
pred_undersample <- predict(ptree_undersample, newdata = test_set,  type = "class")
pred_prior <- predict(ptree_prior, newdata = test_set, type = "class")
pred_loss_matrix <- predict(ptree_loss_matrix, newdata = test_set, type = "class")
pred_weights <- predict(ptree_weights, newdata = test_set, type = "class")

# Construct confusion matrices using the predictions.
confmat_undersample <- table(test_set$loan_status, pred_undersample)
confmat_prior <- table(test_set$loan_status, pred_prior)
confmat_loss_matrix <- table(test_set$loan_status, pred_loss_matrix)
confmat_weights <- table(test_set$loan_status, pred_weights)

# Compute the accuracies
acc_undersample <- sum(diag(confmat_undersample)) / nrow(test_set)
acc_prior <- sum(diag(confmat_prior)) / nrow(test_set)
acc_loss_matrix <- sum(diag(confmat_loss_matrix)) / nrow(test_set)
acc_weights <- sum(diag(confmat_weights)) / nrow(test_set)

# Make predictions for the probability of default using the pruned tree and the test set.
prob_default_prior <- predict(ptree_prior, newdata = test_set)[ ,2]

# Obtain the cutoff for acceptance rate 80%
cutoff_prior <- quantile(prob_default_prior, 0.8)

# Obtain the binary predictions.
bin_pred_prior_80 <- ifelse(prob_default_prior > cutoff_prior, 1, 0)

# Obtain the actual default status for the accepted loans
accepted_status_prior_80 <- test_set$loan_status[bin_pred_prior_80 == 0]

# Obtain the bad rate for the accepted loans
sum(accepted_status_prior_80) / length(accepted_status_prior_80)

# Have a look at the function strategy_bank
strategy_bank

# Apply the function strategy_bank to both predictions_cloglog and predictions_loss_matrix
strategy_cloglog <- strategy_bank(predictions_cloglog)
strategy_loss_matrix <- strategy_bank(predictions_loss_matrix)

# Obtain the strategy tables for both prediction-vectors
strategy_cloglog$table
strategy_loss_matrix$table

# Draw the strategy functions
par(mfrow = c(1,2))
plot(strategy_cloglog$accept_rate, strategy_cloglog$bad_rate,
     type = "l", xlab = "Acceptance rate", ylab = "Bad rate",
     lwd = 2, main = "logistic regression")

plot(strategy_loss_matrix$accept_rate, strategy_loss_matrix$bad_rate,
     type = "l", xlab = "Acceptance rate",
     ylab = "Bad rate", lwd = 2, main = "tree")

# Load the pROC-package
library(pROC)
# Construct the objects containing ROC-information
ROC_logit <- roc(test_set$loan_status, predictions_logit)
ROC_probit <-roc(test_set$loan_status,predictions_probit)
ROC_cloglog <-roc(test_set$loan_status,predictions_cloglog)
ROC_all_full <- roc(test_set$loan_status,predictions_all_full)
# Draw all ROCs on one plot
plot(ROC_logit);lines(ROC_probit,col="blue");lines(ROC_cloglog,col="red");lines(ROC_all_full,col="green")
# Compute the AUCs
auc(ROC_logit)
auc(ROC_probit)
auc(ROC_cloglog)
auc(ROC_all_full)

# Construct the objects containing ROC-information
ROC_undersample <-roc(test_set$loan_status,predictions_undersample)
ROC_prior <-roc(test_set$loan_status,predictions_prior)
ROC_loss_matrix <-roc(test_set$loan_status,predictions_loss_matrix)
ROC_weights <- roc(test_set$loan_status,predictions_weights)
# Draw the ROC-curves in one plot
plot(ROC_undersample);lines(ROC_prior,col="blue");lines(ROC_loss_matrix,col="red");lines(ROC_weights,col="green")
# Compute the AUCs
auc(ROC_undersample)
auc(ROC_prior)
auc(ROC_loss_matrix)
auc(ROC_weights)

# Build four models each time deleting one variable in log_3_remove_ir
log_4_remove_amnt <- glm(loan_status ~ grade + annual_inc + emp_cat,
                         family = binomial, data = training_set)
log_4_remove_grade <- glm(loan_status ~ loan_amnt  + annual_inc + emp_cat,
                          family = binomial, data = training_set)
log_4_remove_inc <- glm(loan_status ~ loan_amnt + grade  + emp_cat ,
                        family = binomial, data = training_set)
log_4_remove_emp <- glm(loan_status ~ loan_amnt + grade + annual_inc,
                        family = binomial, data = training_set)

# Make PD-predictions for each of the models
pred_4_remove_amnt <- predict(log_4_remove_amnt, newdata = test_set, type = "response")
pred_4_remove_grade <- predict(log_4_remove_grade, newdata = test_set, type = "response")
pred_4_remove_inc <- predict(log_4_remove_inc, newdata = test_set, type = "response")
pred_4_remove_emp <- predict(log_4_remove_emp, newdata = test_set, type = "response")

# Compute the AUCs
auc(test_set$loan_status, pred_4_remove_amnt)
auc(test_set$loan_status, pred_4_remove_grade)
auc(test_set$loan_status, pred_4_remove_inc)
auc(test_set$loan_status, pred_4_remove_emp)

# Build three models each time deleting one variable in log_4_remove_amnt
log_5_remove_grade <- glm(loan_status ~ annual_inc + emp_cat, family = binomial, data = training_set)
log_5_remove_inc <- glm(loan_status ~ grade  + emp_cat , family = binomial, data = training_set)
log_5_remove_emp <- glm(loan_status ~ grade + annual_inc, family = binomial, data = training_set)

# Make PD-predictions for each of the models
pred_5_remove_grade <- predict(log_5_remove_grade, newdata = test_set, type = "response")
pred_5_remove_inc <- predict(log_5_remove_inc, newdata = test_set, type = "response")
pred_5_remove_emp <- predict(log_5_remove_emp, newdata = test_set, type = "response")

# Compute the AUCs
auc(test_set$loan_status, pred_5_remove_grade)
auc(test_set$loan_status, pred_5_remove_inc)
auc(test_set$loan_status, pred_5_remove_emp)

# Plot the ROC-curve for the best model here
plot(roc(test_set$loan_status,pred_4_remove_amnt))