Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# BIS352 Final V2.0 -- lift chart conclusion version
# NOTE(review): setwd() and rm(list = ls()) are discouraged in shared
# scripts (machine-specific path; wipes the user's workspace). Kept to
# preserve the original course-work behavior.
setwd("C:/Users/Kirin/OneDrive/Documents/Fall 2019/BIS 352/R_Dataset")
rm(list = ls())
# Load the Tayko mailing data; drop the row-id column (no predictive value).
tayko <- read.csv("TaykoSoftwareCase/Tayko.csv")
tayko <- subset(tayko, select = -c(sequence_number))
library(forecast)    # accuracy()
library(caret)       # confusionMatrix() -- used throughout but never loaded in the original
library(pROC)        # roc(); original loaded it only after the first roc() call
library(rpart)       # rpart(), prune()
library(rpart.plot)  # prp()
library(gains)       # gains()
# Problem 1: expected gross profit of a blanket mailing to 180,000
# customers: 5.3% response rate x $103 average revenue - $2 cost/mailing.
estgrossprofit <- 180000 * 0.053 * 103 - 2 * 180000
set.seed(12345)
# Problem 2 (a): partition the records into disjoint training (800),
# validation (700), and test (500) sets.
# BUG FIX: the original drew three independent sample(2000, ...) calls,
# so the partitions overlapped (data leakage between train/valid/test),
# and computed n without using it.
n <- nrow(tayko)
train.index <- sample(n, 800)
remaining <- setdiff(seq_len(n), train.index)
valid.index <- sample(remaining, 700)
test.index <- setdiff(remaining, valid.index)
train.df <- tayko[train.index, ]
valid.df <- tayko[valid.index, ]
test.df <- tayko[test.index, ]
# Logistic regression for Purchase; Spending is excluded as a predictor
# (it is only observed for purchasers). Three stepwise searches are
# compared by AIC.
set.seed(12345)

# Full model: every predictor except Spending.
log.lm <- glm(Purchase ~ . - Spending, data = train.df, family = "binomial")

# Backward elimination from the full model (AIC = 615.43).
log.lm.back <- step(log.lm, direction = "backward")
summary(log.lm.back)

# Intercept-only model: lower bound of the search scope.
null <- glm(Purchase ~ 1, data = train.df, family = "binomial")
search.scope <- list(lower = null, upper = log.lm)

# Forward selection (AIC = 613.27).
log.lm.forward <- step(null, scope = search.scope, direction = "forward")
summary(log.lm.forward)

# Stepwise in both directions (AIC = 613.27).
log.lm.both <- step(null, scope = list(upper = log.lm), direction = "both")
summary(log.lm.both)
# Prediction and accuracy on the validation set; backward elimination
# gave the highest accuracy on the original run.
# BUG FIX: the original called roc() here but only ran library(pROC)
# further down the script; pROC (and caret, for confusionMatrix()) are
# now loaded at the top.  Redundant re-calls of predict() are replaced
# by reusing the stored probability vectors.
set.seed(12345)

# Backward-elimination model: AUC 0.892; confusion matrix at the
# ROC-suggested cutoff (~0.35).
log.lm.back.predicted <- predict(log.lm.back, valid.df, type = "response")
r <- roc(valid.df$Purchase, log.lm.back.predicted)
plot(r, print.auc = TRUE, auc.polygon = TRUE, grid = c(0.1, 0.2),
     grid.col = c("green", "red"), max.auc.polygon = TRUE,
     auc.polygon.col = "skyblue", print.thres = "best")
log.lm.back.pred <- ifelse(log.lm.back.predicted >= 0.35, 1, 0)
confusionMatrix(as.factor(log.lm.back.pred), as.factor(valid.df$Purchase))

# Forward-selection model at the default 0.5 cutoff.
log.lm.forward.predicted <- predict(log.lm.forward, valid.df, type = "response")
log.lm.forward.pred <- ifelse(log.lm.forward.predicted >= 0.5, 1, 0)
confusionMatrix(as.factor(log.lm.forward.pred), as.factor(valid.df$Purchase))

# Both-directions model: AUC = 0.894, ROC-optimal cutoff ~0.385.
log.lm.both.predicted <- predict(log.lm.both, valid.df, type = "response")
r <- roc(valid.df$Purchase, log.lm.both.predicted)
plot(r, print.auc = TRUE, auc.polygon = TRUE, grid = c(0.1, 0.2),
     grid.col = c("green", "red"), max.auc.polygon = TRUE,
     auc.polygon.col = "skyblue", print.thres = "best")
log.lm.both.pred <- ifelse(log.lm.both.predicted >= 0.385, 1, 0)
confusionMatrix(as.factor(log.lm.both.pred), as.factor(valid.df$Purchase))
# Accuracy of each model across cutoffs 0, 0.05, ..., 1.
# BUG FIXES: the forward loop accumulated into `accT` instead of
# `accT.forward` (so accT.forward stayed empty), and `accT.both` was
# printed but never computed.  Vectors are no longer grown inside loops,
# and factor levels are pinned so extreme cutoffs (all-0 / all-1
# predictions) do not crash confusionMatrix().
cutoffs <- seq(0, 1, 0.05)

# Overall accuracy of `predicted` vs `actual` at each cutoff in `cuts`.
accuracy.by.cutoff <- function(predicted, actual, cuts) {
  vapply(cuts, function(k) {
    cm <- confusionMatrix(
      factor(ifelse(predicted > k, 1, 0), levels = c(0, 1)),
      factor(actual, levels = c(0, 1))
    )
    unname(cm$overall["Accuracy"])
  }, numeric(1))
}

set.seed(12345)
accT <- setNames(
  accuracy.by.cutoff(log.lm.back.predicted, valid.df$Purchase, cutoffs),
  cutoffs)
accT
accT.forward <- setNames(
  accuracy.by.cutoff(log.lm.forward.predicted, valid.df$Purchase, cutoffs),
  cutoffs)
accT.forward
accT.both <- setNames(
  accuracy.by.cutoff(log.lm.both.predicted, valid.df$Purchase, cutoffs),
  cutoffs)
accT.both

confusionMatrix(as.factor(ifelse(log.lm.both.predicted > 0.45, 1, 0)),
                as.factor(valid.df$Purchase))

# Cutoff 0.5 gave the highest accuracy (0.7957) and the highest
# sensitivity + specificity sum for the both-directions model.
log.lm.both.pred <- ifelse(log.lm.both.predicted >= 0.5, 1, 0)
confusionMatrix(as.factor(log.lm.both.pred), as.factor(valid.df$Purchase))
# Classification tree.  Minimum-error tree: 81.71% validation accuracy;
# best-pruned tree: ~79%.
library(rpart)
library(rpart.plot)
set.seed(12345)

# Grow the tree with 10-fold cross-validation; Spending excluded as before.
cv.ct <- rpart(Purchase ~ . - Spending, data = train.df, method = "class",
               control = rpart.control(xval = 10), cp = 0.01)
printcp(cv.ct)

# BUG FIX: the original used a single subscript
# (cv.ct$cptable[which.min(...)]), which returns a cell from the first
# column, not the CP of the minimum-xerror row; index row AND column.
min.xerror.cp <- cv.ct$cptable[which.min(cv.ct$cptable[, "xerror"]), "CP"]
min.xerror.cp

# Minimum-error tree (cp = 0.01; equivalently prune at min.xerror.cp).
min.ct <- prune(cv.ct, cp = 0.01)
prp(min.ct, type = 1, extra = 2, split.font = 1, varlen = -10, under = TRUE)

# Validation-set confusion matrix for the minimum-error tree.
min.ct.pred <- predict(min.ct, valid.df, type = "class")
confusionMatrix(factor(min.ct.pred), factor(valid.df$Purchase), positive = "1")

# Best-pruned tree: largest cp whose xerror falls within one standard
# error of the minimum (0.38287 + 0.027949 = 0.410819 from the cp table);
# cp = 0.016373 read off printcp() output.
best.ct <- prune(cv.ct, cp = 0.016373)
prp(best.ct, type = 1, extra = 2, split.font = 1, varlen = -10, under = TRUE)
best.ct.pred <- predict(best.ct, valid.df, type = "class")
confusionMatrix(factor(best.ct.pred), factor(valid.df$Purchase), positive = "1")
# Problem 3 (a): predict Spending among purchasers only, comparing
# stepwise linear regressions (same formula for all three searches).
set.seed(12345)
tayko.spending <- subset(tayko, Purchase == 1)

# BUG FIX: the original drew three independent sample(1000, ...) calls,
# so the spending partitions overlapped; draw them disjointly instead
# (400 train / 350 validation / remainder test).
n.purch <- nrow(tayko.spending)
train.index <- sample(n.purch, 400)
remaining <- setdiff(seq_len(n.purch), train.index)
valid.index <- sample(remaining, 350)
test.index <- setdiff(remaining, valid.index)
train.spending.df <- tayko.spending[train.index, ]
valid.spending.df <- tayko.spending[valid.index, ]
test.spending.df <- tayko.spending[test.index, ]

# Full linear model (Purchase is constant 1 here, so it is excluded).
sp.lm <- lm(Spending ~ . - Purchase, data = train.spending.df)
sp.lm

# Backward elimination.
sp.lm.back <- step(sp.lm, direction = "backward")
summary(sp.lm.back)
sp.lm.back.predicted <- predict(sp.lm.back, valid.spending.df)
accuracy(sp.lm.back.predicted, valid.spending.df$Spending)

# Forward selection from the intercept-only model.
null <- lm(Spending ~ 1, data = train.spending.df)
sp.lm.forward <- step(null, scope = list(lower = null, upper = sp.lm),
                      direction = "forward")
summary(sp.lm.forward)
sp.lm.forward.predicted <- predict(sp.lm.forward, valid.spending.df)
accuracy(sp.lm.forward.predicted, valid.spending.df$Spending)

# Stepwise in both directions.
sp.lm.both <- step(null, scope = list(upper = sp.lm), direction = "both")
sp.lm.both.predicted <- predict(sp.lm.both, valid.spending.df)
summary(sp.lm.both)
accuracy(sp.lm.both.predicted, valid.spending.df$Spending)
## Regression tree for Spending.
set.seed(12345)

# BUG FIX: the original trained/validated on train.df/valid.df (the full
# problem-2 partitions, where non-purchasers have Spending = 0), although
# problem 3 models Spending among purchasers and the linear models above
# use the spending partitions; use the same spending partitions here.
fit <- rpart(Spending ~ . - Purchase, data = train.spending.df, cp = 0.01)
printcp(fit)

# CP of the row with the minimum cross-validated error.
min.xerror <- fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
min.xerror

# Minimum-error tree (RMSE 127.6529 on the original run).
pfit_min <- prune(fit, cp = min.xerror)
prp(pfit_min, type = 1, extra = 1, split.font = 1, varlen = 0,
    digits = -1, main = "regression tree for Spending")
min.pred <- predict(pfit_min, valid.spending.df)
accuracy(min.pred, valid.spending.df$Spending)

# Best-pruned tree (RMSE 138.7743): largest cp within one standard error
# of the minimum xerror (0.43775 + 0.079485 = 0.517235 from the cp
# table); cp = 0.053856 read off printcp() output.
pfit_best <- prune(fit, cp = 0.053856)
prp(pfit_best, type = 1, extra = 1, split.font = 1, varlen = 0,
    digits = -1, main = "regression tree for Spending")
best.pred <- predict(pfit_best, valid.spending.df)
accuracy(best.pred, valid.spending.df$Spending)
# Problem 4 (a): score the test set with the classification model at the
# ROC-optimal cutoff (0.385).
set.seed(12345)
test.prob <- predict(log.lm.both, test.df, type = "response")
test.class <- ifelse(test.prob >= 0.385, 1, 0)
confusionMatrix(as.factor(test.class), as.factor(test.df$Purchase))

# Problem 4 (b): expected spending per test-set customer.
# BUG FIX: the original predicted spending on test.spending.df (250
# purchasers) and then cbind()-ed it against the 500-row test-set
# vectors, silently recycling; predict on the same test.df rows instead.
test.pred <- predict(pfit_min, test.df)
accuracy(test.pred[test.df$Purchase == 1],
         test.df$Spending[test.df$Purchase == 1])

scoreanalysis <- data.frame(test.prob, test.pred, test.class)
# Zero predicted spending for customers classified as non-purchasers.
scoreanalysis$test.pred[test.class == 0] <- 0
# Adjust the purchase probability for oversampling: the original mailing
# had a 0.107 response rate.
scoreanalysis$adjustedProbability <- test.prob * 0.107
scoreanalysis$expectedSpending <- scoreanalysis$adjustedProbability *
  scoreanalysis$test.pred
scoreanalysis$expectedSpending[test.class == 0] <- 0
scoreanalysis

# Cumulative gains on the test set.
library(gains)
gain <- gains(test.df$Spending, scoreanalysis$expectedSpending, groups = 10)
gain
data.frame("depth" = gain["depth"], "obs" = gain["obs"],
           "cume.obs" = gain["cume.obs"], "mean.resp" = gain["mean.resp"],
           "cume.mean.resp" = gain["cume.mean.resp"],
           "cume.pct.of.total" = gain["cume.pct.of.total"],
           "lift" = gain["lift"], "cume.lift" = gain["cume.lift"],
           "mean.prediction" = gain["mean.prediction"])

# Records ranked by expected spending (renamed from `sort`, which
# shadowed base::sort).
sorted.scores <- data.frame(
  "target" = test.df$Spending,
  "response" = round(scoreanalysis$expectedSpending, 4)
)[order(scoreanalysis$expectedSpending, decreasing = TRUE), ]

# Cumulative lift chart; the dashed line is the no-model baseline.
# BUG FIX: the baseline previously spanned dim(valid.df)[1] rows on a
# test-set chart; use the test set's own size.
plot(c(0, gain$cume.pct.of.total * sum(test.df$Spending)) ~
       c(0, gain$cume.obs),
     xlab = "# cases", ylab = "Cumulative", main = "", type = "l")
lines(c(0, sum(test.df$Spending)) ~ c(0, dim(test.df)[1]), lty = 2)
sum(test.df$Spending)
# Lift and decile-wise charts for expected spending on the test set.
# BUG FIX: this section was pasted from a different exercise -- it
# referenced bank.nn.pred and valid.df$Personal.Loan, neither of which
# exists in this script, and scored the 500 test predictions against the
# 700-row validation set.  Everything below uses the test set.
library(gains)
gain <- gains(test.df$Spending, scoreanalysis$expectedSpending, groups = 10)
gain
data.frame("depth" = gain["depth"], "obs" = gain["obs"],
           "cume.obs" = gain["cume.obs"], "mean.resp" = gain["mean.resp"],
           "cume.mean.resp" = gain["cume.mean.resp"],
           "cume.pct.of.total" = gain["cume.pct.of.total"],
           "lift" = gain["lift"], "cume.lift" = gain["cume.lift"],
           "mean.prediction" = gain["mean.prediction"])

# Top test-set records ranked by expected spending.
ranked <- data.frame(
  "target" = test.df$Spending,
  "response" = round(scoreanalysis$expectedSpending, 4)
)[order(scoreanalysis$expectedSpending, decreasing = TRUE), ]
ranked[1:200, ]

# Cumulative lift chart; dashed diagonal = no-model baseline.
plot(c(0, gain$cume.pct.of.total * sum(test.df$Spending)) ~
       c(0, gain$cume.obs),
     xlab = "# cases", ylab = "Cumulative", main = "", type = "l")
lines(c(0, sum(test.df$Spending)) ~ c(0, dim(test.df)[1]), lty = 2)

# Decile-wise lift chart: mean response per decile / overall mean response.
heights <- gain$mean.resp / mean(test.df$Spending)
midpoints <- barplot(heights, names.arg = gain$depth, ylim = c(0, 9),
                     xlab = "Percentile", ylab = "Mean Response",
                     main = "Decile-wise lift chart")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement