Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# BIS352 Final V2.0 -- lift chart conclusion version
# NOTE(review): setwd() and rm(list = ls()) are discouraged in shared
# scripts (machine-specific path; wipes the user's workspace). Kept to
# preserve the original course-work behavior.
setwd("C:/Users/Kirin/OneDrive/Documents/Fall 2019/BIS 352/R_Dataset")
rm(list = ls())
# Load the Tayko mailing data; drop the row-id column (no predictive value).
tayko <- read.csv("TaykoSoftwareCase/Tayko.csv")
tayko <- subset(tayko, select = -c(sequence_number))
library(forecast)    # accuracy()
library(caret)       # confusionMatrix() -- used throughout but never loaded in the original
library(pROC)        # roc(); original loaded it only after the first roc() call
library(rpart)       # rpart(), prune()
library(rpart.plot)  # prp()
library(gains)       # gains()
# Problem 1: expected gross profit of a blanket mailing to 180,000
# customers: 5.3% response rate x $103 average revenue - $2 cost/mailing.
estgrossprofit <- 180000 * 0.053 * 103 - 2 * 180000
set.seed(12345)
# Problem 2 (a): partition the records into disjoint training (800),
# validation (700), and test (500) sets.
# BUG FIX: the original drew three independent sample(2000, ...) calls,
# so the partitions overlapped (data leakage between train/valid/test),
# and computed n without using it.
n <- nrow(tayko)
train.index <- sample(n, 800)
remaining <- setdiff(seq_len(n), train.index)
valid.index <- sample(remaining, 700)
test.index <- setdiff(remaining, valid.index)
train.df <- tayko[train.index, ]
valid.df <- tayko[valid.index, ]
test.df <- tayko[test.index, ]
# Logistic regression for Purchase; Spending is excluded as a predictor
# (it is only observed for purchasers). Three stepwise searches are
# compared by AIC.
set.seed(12345)

# Full model: every predictor except Spending.
log.lm <- glm(Purchase ~ . - Spending, data = train.df, family = "binomial")

# Backward elimination from the full model (AIC = 615.43).
log.lm.back <- step(log.lm, direction = "backward")
summary(log.lm.back)

# Intercept-only model: lower bound of the search scope.
null <- glm(Purchase ~ 1, data = train.df, family = "binomial")
search.scope <- list(lower = null, upper = log.lm)

# Forward selection (AIC = 613.27).
log.lm.forward <- step(null, scope = search.scope, direction = "forward")
summary(log.lm.forward)

# Stepwise in both directions (AIC = 613.27).
log.lm.both <- step(null, scope = list(upper = log.lm), direction = "both")
summary(log.lm.both)
# Prediction and accuracy on the validation set; backward elimination
# gave the highest accuracy on the original run.
# BUG FIX: the original called roc() here but only ran library(pROC)
# further down the script; pROC (and caret, for confusionMatrix()) are
# now loaded at the top.  Redundant re-calls of predict() are replaced
# by reusing the stored probability vectors.
set.seed(12345)

# Backward-elimination model: AUC 0.892; confusion matrix at the
# ROC-suggested cutoff (~0.35).
log.lm.back.predicted <- predict(log.lm.back, valid.df, type = "response")
r <- roc(valid.df$Purchase, log.lm.back.predicted)
plot(r, print.auc = TRUE, auc.polygon = TRUE, grid = c(0.1, 0.2),
     grid.col = c("green", "red"), max.auc.polygon = TRUE,
     auc.polygon.col = "skyblue", print.thres = "best")
log.lm.back.pred <- ifelse(log.lm.back.predicted >= 0.35, 1, 0)
confusionMatrix(as.factor(log.lm.back.pred), as.factor(valid.df$Purchase))

# Forward-selection model at the default 0.5 cutoff.
log.lm.forward.predicted <- predict(log.lm.forward, valid.df, type = "response")
log.lm.forward.pred <- ifelse(log.lm.forward.predicted >= 0.5, 1, 0)
confusionMatrix(as.factor(log.lm.forward.pred), as.factor(valid.df$Purchase))

# Both-directions model: AUC = 0.894, ROC-optimal cutoff ~0.385.
log.lm.both.predicted <- predict(log.lm.both, valid.df, type = "response")
r <- roc(valid.df$Purchase, log.lm.both.predicted)
plot(r, print.auc = TRUE, auc.polygon = TRUE, grid = c(0.1, 0.2),
     grid.col = c("green", "red"), max.auc.polygon = TRUE,
     auc.polygon.col = "skyblue", print.thres = "best")
log.lm.both.pred <- ifelse(log.lm.both.predicted >= 0.385, 1, 0)
confusionMatrix(as.factor(log.lm.both.pred), as.factor(valid.df$Purchase))
# Accuracy of each model across cutoffs 0, 0.05, ..., 1.
# BUG FIXES: the forward loop accumulated into `accT` instead of
# `accT.forward` (so accT.forward stayed empty), and `accT.both` was
# printed but never computed.  Vectors are no longer grown inside loops,
# and factor levels are pinned so extreme cutoffs (all-0 / all-1
# predictions) do not crash confusionMatrix().
cutoffs <- seq(0, 1, 0.05)

# Overall accuracy of `predicted` vs `actual` at each cutoff in `cuts`.
accuracy.by.cutoff <- function(predicted, actual, cuts) {
  vapply(cuts, function(k) {
    cm <- confusionMatrix(
      factor(ifelse(predicted > k, 1, 0), levels = c(0, 1)),
      factor(actual, levels = c(0, 1))
    )
    unname(cm$overall["Accuracy"])
  }, numeric(1))
}

set.seed(12345)
accT <- setNames(
  accuracy.by.cutoff(log.lm.back.predicted, valid.df$Purchase, cutoffs),
  cutoffs)
accT
accT.forward <- setNames(
  accuracy.by.cutoff(log.lm.forward.predicted, valid.df$Purchase, cutoffs),
  cutoffs)
accT.forward
accT.both <- setNames(
  accuracy.by.cutoff(log.lm.both.predicted, valid.df$Purchase, cutoffs),
  cutoffs)
accT.both

confusionMatrix(as.factor(ifelse(log.lm.both.predicted > 0.45, 1, 0)),
                as.factor(valid.df$Purchase))

# Cutoff 0.5 gave the highest accuracy (0.7957) and the highest
# sensitivity + specificity sum for the both-directions model.
log.lm.both.pred <- ifelse(log.lm.both.predicted >= 0.5, 1, 0)
confusionMatrix(as.factor(log.lm.both.pred), as.factor(valid.df$Purchase))
# Classification tree.  Minimum-error tree: 81.71% validation accuracy;
# best-pruned tree: ~79%.
library(rpart)
library(rpart.plot)
set.seed(12345)

# Grow the tree with 10-fold cross-validation; Spending excluded as before.
cv.ct <- rpart(Purchase ~ . - Spending, data = train.df, method = "class",
               control = rpart.control(xval = 10), cp = 0.01)
printcp(cv.ct)

# BUG FIX: the original used a single subscript
# (cv.ct$cptable[which.min(...)]), which returns a cell from the first
# column, not the CP of the minimum-xerror row; index row AND column.
min.xerror.cp <- cv.ct$cptable[which.min(cv.ct$cptable[, "xerror"]), "CP"]
min.xerror.cp

# Minimum-error tree (cp = 0.01; equivalently prune at min.xerror.cp).
min.ct <- prune(cv.ct, cp = 0.01)
prp(min.ct, type = 1, extra = 2, split.font = 1, varlen = -10, under = TRUE)

# Validation-set confusion matrix for the minimum-error tree.
min.ct.pred <- predict(min.ct, valid.df, type = "class")
confusionMatrix(factor(min.ct.pred), factor(valid.df$Purchase), positive = "1")

# Best-pruned tree: largest cp whose xerror falls within one standard
# error of the minimum (0.38287 + 0.027949 = 0.410819 from the cp table);
# cp = 0.016373 read off printcp() output.
best.ct <- prune(cv.ct, cp = 0.016373)
prp(best.ct, type = 1, extra = 2, split.font = 1, varlen = -10, under = TRUE)
best.ct.pred <- predict(best.ct, valid.df, type = "class")
confusionMatrix(factor(best.ct.pred), factor(valid.df$Purchase), positive = "1")
# Problem 3 (a): predict Spending among purchasers only, comparing
# stepwise linear regressions (same formula for all three searches).
set.seed(12345)
tayko.spending <- subset(tayko, Purchase == 1)

# BUG FIX: the original drew three independent sample(1000, ...) calls,
# so the spending partitions overlapped; draw them disjointly instead
# (400 train / 350 validation / remainder test).
n.purch <- nrow(tayko.spending)
train.index <- sample(n.purch, 400)
remaining <- setdiff(seq_len(n.purch), train.index)
valid.index <- sample(remaining, 350)
test.index <- setdiff(remaining, valid.index)
train.spending.df <- tayko.spending[train.index, ]
valid.spending.df <- tayko.spending[valid.index, ]
test.spending.df <- tayko.spending[test.index, ]

# Full linear model (Purchase is constant 1 here, so it is excluded).
sp.lm <- lm(Spending ~ . - Purchase, data = train.spending.df)
sp.lm

# Backward elimination.
sp.lm.back <- step(sp.lm, direction = "backward")
summary(sp.lm.back)
sp.lm.back.predicted <- predict(sp.lm.back, valid.spending.df)
accuracy(sp.lm.back.predicted, valid.spending.df$Spending)

# Forward selection from the intercept-only model.
null <- lm(Spending ~ 1, data = train.spending.df)
sp.lm.forward <- step(null, scope = list(lower = null, upper = sp.lm),
                      direction = "forward")
summary(sp.lm.forward)
sp.lm.forward.predicted <- predict(sp.lm.forward, valid.spending.df)
accuracy(sp.lm.forward.predicted, valid.spending.df$Spending)

# Stepwise in both directions.
sp.lm.both <- step(null, scope = list(upper = sp.lm), direction = "both")
sp.lm.both.predicted <- predict(sp.lm.both, valid.spending.df)
summary(sp.lm.both)
accuracy(sp.lm.both.predicted, valid.spending.df$Spending)
## Regression tree for Spending.
set.seed(12345)

# BUG FIX: the original trained/validated on train.df/valid.df (the full
# problem-2 partitions, where non-purchasers have Spending = 0), although
# problem 3 models Spending among purchasers and the linear models above
# use the spending partitions; use the same spending partitions here.
fit <- rpart(Spending ~ . - Purchase, data = train.spending.df, cp = 0.01)
printcp(fit)

# CP of the row with the minimum cross-validated error.
min.xerror <- fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
min.xerror

# Minimum-error tree (RMSE 127.6529 on the original run).
pfit_min <- prune(fit, cp = min.xerror)
prp(pfit_min, type = 1, extra = 1, split.font = 1, varlen = 0,
    digits = -1, main = "regression tree for Spending")
min.pred <- predict(pfit_min, valid.spending.df)
accuracy(min.pred, valid.spending.df$Spending)

# Best-pruned tree (RMSE 138.7743): largest cp within one standard error
# of the minimum xerror (0.43775 + 0.079485 = 0.517235 from the cp
# table); cp = 0.053856 read off printcp() output.
pfit_best <- prune(fit, cp = 0.053856)
prp(pfit_best, type = 1, extra = 1, split.font = 1, varlen = 0,
    digits = -1, main = "regression tree for Spending")
best.pred <- predict(pfit_best, valid.spending.df)
accuracy(best.pred, valid.spending.df$Spending)
# Problem 4 (a): score the test set with the classification model at the
# ROC-optimal cutoff (0.385).
set.seed(12345)
test.prob <- predict(log.lm.both, test.df, type = "response")
test.class <- ifelse(test.prob >= 0.385, 1, 0)
confusionMatrix(as.factor(test.class), as.factor(test.df$Purchase))

# Problem 4 (b): expected spending per test-set customer.
# BUG FIX: the original predicted spending on test.spending.df (250
# purchasers) and then cbind()-ed it against the 500-row test-set
# vectors, silently recycling; predict on the same test.df rows instead.
test.pred <- predict(pfit_min, test.df)
accuracy(test.pred[test.df$Purchase == 1],
         test.df$Spending[test.df$Purchase == 1])

scoreanalysis <- data.frame(test.prob, test.pred, test.class)
# Zero predicted spending for customers classified as non-purchasers.
scoreanalysis$test.pred[test.class == 0] <- 0
# Adjust the purchase probability for oversampling: the original mailing
# had a 0.107 response rate.
scoreanalysis$adjustedProbability <- test.prob * 0.107
scoreanalysis$expectedSpending <- scoreanalysis$adjustedProbability *
  scoreanalysis$test.pred
scoreanalysis$expectedSpending[test.class == 0] <- 0
scoreanalysis

# Cumulative gains on the test set.
library(gains)
gain <- gains(test.df$Spending, scoreanalysis$expectedSpending, groups = 10)
gain
data.frame("depth" = gain["depth"], "obs" = gain["obs"],
           "cume.obs" = gain["cume.obs"], "mean.resp" = gain["mean.resp"],
           "cume.mean.resp" = gain["cume.mean.resp"],
           "cume.pct.of.total" = gain["cume.pct.of.total"],
           "lift" = gain["lift"], "cume.lift" = gain["cume.lift"],
           "mean.prediction" = gain["mean.prediction"])

# Records ranked by expected spending (renamed from `sort`, which
# shadowed base::sort).
sorted.scores <- data.frame(
  "target" = test.df$Spending,
  "response" = round(scoreanalysis$expectedSpending, 4)
)[order(scoreanalysis$expectedSpending, decreasing = TRUE), ]

# Cumulative lift chart; the dashed line is the no-model baseline.
# BUG FIX: the baseline previously spanned dim(valid.df)[1] rows on a
# test-set chart; use the test set's own size.
plot(c(0, gain$cume.pct.of.total * sum(test.df$Spending)) ~
       c(0, gain$cume.obs),
     xlab = "# cases", ylab = "Cumulative", main = "", type = "l")
lines(c(0, sum(test.df$Spending)) ~ c(0, dim(test.df)[1]), lty = 2)
sum(test.df$Spending)
# Lift and decile-wise charts for expected spending on the test set.
# BUG FIX: this section was pasted from a different exercise -- it
# referenced bank.nn.pred and valid.df$Personal.Loan, neither of which
# exists in this script, and scored the 500 test predictions against the
# 700-row validation set.  Everything below uses the test set.
library(gains)
gain <- gains(test.df$Spending, scoreanalysis$expectedSpending, groups = 10)
gain
data.frame("depth" = gain["depth"], "obs" = gain["obs"],
           "cume.obs" = gain["cume.obs"], "mean.resp" = gain["mean.resp"],
           "cume.mean.resp" = gain["cume.mean.resp"],
           "cume.pct.of.total" = gain["cume.pct.of.total"],
           "lift" = gain["lift"], "cume.lift" = gain["cume.lift"],
           "mean.prediction" = gain["mean.prediction"])

# Top test-set records ranked by expected spending.
ranked <- data.frame(
  "target" = test.df$Spending,
  "response" = round(scoreanalysis$expectedSpending, 4)
)[order(scoreanalysis$expectedSpending, decreasing = TRUE), ]
ranked[1:200, ]

# Cumulative lift chart; dashed diagonal = no-model baseline.
plot(c(0, gain$cume.pct.of.total * sum(test.df$Spending)) ~
       c(0, gain$cume.obs),
     xlab = "# cases", ylab = "Cumulative", main = "", type = "l")
lines(c(0, sum(test.df$Spending)) ~ c(0, dim(test.df)[1]), lty = 2)

# Decile-wise lift chart: mean response per decile / overall mean response.
heights <- gain$mean.resp / mean(test.df$Spending)
midpoints <- barplot(heights, names.arg = gain$depth, ylim = c(0, 9),
                     xlab = "Percentile", ylab = "Mean Response",
                     main = "Decile-wise lift chart")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement