#BIS352 Final V2.0 Lift chart Conclusion version


setwd("C:/Users/Kirin/OneDrive/Documents/Fall 2019/BIS 352/R_Dataset")
rm(list = ls())
tayko <- read.csv("TaykoSoftwareCase/Tayko.csv")
tayko <- subset(tayko, select = -c(sequence_number))

library(forecast)  #accuracy()
library(caret)     #confusionMatrix()
library(pROC)      #roc(); loaded here so it is available before its first use in problem 2

#problem 1: estimated gross profit from mailing the 180,000-name pool
estgrossprofit <- 180000*0.053*103 - 2*180000
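# Hedged breakdown of the problem 1 arithmetic above (reading the terms off the formula itself):
# 180,000 catalogs x 0.053 response rate x $103 average spending per responder = 982,620 expected revenue,
# minus a $2 mailing cost per catalog x 180,000 = 360,000, giving an estimated gross profit of 622,620.
estgrossprofit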
set.seed(12345)
#problem 2 (a): partition the 2,000 records into disjoint training (800), validation (700), and test (500) sets
n <- nrow(tayko)
train.index <- sample(n, 800)
valid.index <- sample(setdiff(1:n, train.index), 700)
test.index  <- setdiff(1:n, c(train.index, valid.index))
train.df <- tayko[train.index, ]
valid.df <- tayko[valid.index, ]
test.df  <- tayko[test.index, ]
#classifiers considered: logistic regression (stepwise selection), neural network, classification tree; logistic regression first
set.seed(12345)
log.lm <- glm(Purchase ~ . - Spending, data = train.df, family = "binomial")
log.lm.back <- step(log.lm, direction = "backward")  #AIC = 615.43

summary(log.lm.back)
null <- glm(Purchase ~ 1, data = train.df, family = "binomial")
log.lm.forward <- step(null, scope = list(lower = null, upper = log.lm), direction = "forward")  #AIC = 613.27
summary(log.lm.forward)
log.lm.both <- step(null, scope = list(upper = log.lm), direction = "both")  #AIC = 613.27
summary(log.lm.both)
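# Hedged addition (not in the original script): list the three stepwise models' AIC values
# side by side; AIC() accepts several fitted glm objects and returns one row per model.
AIC(log.lm.back, log.lm.forward, log.lm.both)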
#Prediction and accuracy: backward elimination has the highest accuracy
set.seed(12345)
log.lm.back.predicted <- predict(log.lm.back, valid.df, type = "response")
r <- roc(valid.df$Purchase, log.lm.back.predicted)
plot(r, print.auc=TRUE, auc.polygon=TRUE, grid=c(0.1, 0.2),
     grid.col=c("green", "red"), max.auc.polygon=TRUE,
     auc.polygon.col="skyblue", print.thres="best")  #AUC: 0.892
log.lm.back.pred <- ifelse(predict(log.lm.back, valid.df, type = "response") >= 0.35, 1, 0)
confusionMatrix(as.factor(log.lm.back.pred), as.factor(valid.df$Purchase))
log.lm.forward.predicted <- predict(log.lm.forward, valid.df, type = "response")
log.lm.forward.pred <- ifelse(predict(log.lm.forward, valid.df, type = "response") >= 0.5, 1, 0)
confusionMatrix(as.factor(log.lm.forward.pred), as.factor(valid.df$Purchase))
log.lm.both.predicted <- predict(log.lm.both, valid.df, type = "response")
r <- roc(valid.df$Purchase, log.lm.both.predicted)
plot(r, print.auc=TRUE, auc.polygon=TRUE, grid=c(0.1, 0.2),
     grid.col=c("green", "red"), max.auc.polygon=TRUE,
     auc.polygon.col="skyblue", print.thres="best")  #AUC = 0.894, best cutoff 0.385
log.lm.both.pred <- ifelse(predict(log.lm.both, valid.df, type = "response") >= 0.385, 1, 0)
confusionMatrix(as.factor(log.lm.both.pred), as.factor(valid.df$Purchase))
#try different cutoff values: roughly 0.5 works best for backward elimination, roughly 0.45 for forward selection
set.seed(12345)
accT.back <- c()
for (cutoff in seq(0, 1, 0.05)) {
  cm <- confusionMatrix(factor(ifelse(log.lm.back.predicted > cutoff, 1, 0), levels = c(0, 1)), as.factor(valid.df$Purchase))
  accT.back <- c(accT.back, cm$overall[1])
}
accT.back
accT.forward <- c()
for (cutoff in seq(0, 1, 0.05)) {
  cm <- confusionMatrix(factor(ifelse(log.lm.forward.predicted > cutoff, 1, 0), levels = c(0, 1)), as.factor(valid.df$Purchase))
  accT.forward <- c(accT.forward, cm$overall[1])
}
accT.forward
confusionMatrix(as.factor(ifelse(log.lm.both.predicted > 0.45, 1, 0)), as.factor(valid.df$Purchase))
accT.both <- c()
for (cutoff in seq(0, 1, 0.05)) {
  cm <- confusionMatrix(factor(ifelse(log.lm.both.predicted > cutoff, 1, 0), levels = c(0, 1)), as.factor(valid.df$Purchase))
  accT.both <- c(accT.both, cm$overall[1])
}
accT.both
log.lm.both.pred <- ifelse(predict(log.lm.both, valid.df, type = "response") >= 0.5, 1, 0)  #gives the highest accuracy (0.7957) and the highest sum of sensitivity and specificity
#accuracy across the cutoff values is plotted in the sketch right after this block
confusionMatrix(as.factor(log.lm.both.pred), as.factor(valid.df$Purchase))
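
# Hedged sketch (not part of the original script): plot validation accuracy against the
# cutoff values swept above, using the accT.back / accT.forward / accT.both vectors
# filled by the three loops.
cutoffs <- seq(0, 1, 0.05)
plot(cutoffs[seq_along(accT.back)], accT.back, type = "l", lty = 1,
     xlab = "Cutoff", ylab = "Validation accuracy",
     ylim = range(c(accT.back, accT.forward, accT.both)))
lines(cutoffs[seq_along(accT.forward)], accT.forward, lty = 2)
lines(cutoffs[seq_along(accT.both)], accT.both, lty = 3)
legend("bottomright", legend = c("backward", "forward", "both"), lty = 1:3)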

#classification tree: minimum-error tree accuracy 81.71%, best-pruned tree 79%
library(rpart)
library(rpart.plot)
set.seed(12345)
cv.ct <- rpart(Purchase ~ . - Spending, data = train.df, method = "class",
               control = rpart.control(xval = 10), cp = 0.01)
printcp(cv.ct)
min.xerror.cp <- cv.ct$cptable[which.min(cv.ct$cptable[, "xerror"]), "CP"]
min.xerror.cp
min.ct <- prune(cv.ct, cp = 0.01)  #or use cp = min.xerror.cp
prp(min.ct, type = 1, extra = 2, split.font = 1, varlen = -10, under = TRUE)
#once we have the pruned tree, apply it to the validation set
#confusion matrix for the validation-set predictions
min.ct.pred <- predict(min.ct, valid.df, type = "class")
confusionMatrix(factor(min.ct.pred), factor(valid.df$Purchase), positive = "1")
0.38287 + 0.027949  #presumably min xerror + its xstd from printcp(cv.ct), i.e. the 1-SE threshold
0.0100756           #scratch value recorded from the cp table
best.ct <- prune(cv.ct, cp = 0.016373)  #smallest tree with xerror below the threshold
prp(best.ct, type = 1, extra = 2, split.font = 1, varlen = -10, under = TRUE)
best.ct.pred <- predict(best.ct, valid.df, type = "class")
confusionMatrix(factor(best.ct.pred), factor(valid.df$Purchase), positive = "1")
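
# Hedged sketch (not in the original): pick the best-pruned cp with the 1-SE rule directly
# from the cp table, instead of typing the xerror and xstd values from printcp() by hand.
cpt <- cv.ct$cptable
xerr.min.row <- which.min(cpt[, "xerror"])
se.threshold <- cpt[xerr.min.row, "xerror"] + cpt[xerr.min.row, "xstd"]
best.cp <- cpt[which(cpt[, "xerror"] <= se.threshold)[1], "CP"]
best.cp  #should correspond roughly to the cp = 0.016373 chosen above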

#problem 3 (a): predict Spending among purchasers (same stepwise approach for each formula), then a regression tree
set.seed(12345)
tayko.spending <- subset(tayko, Purchase == 1)
train.index <- sample(nrow(tayko.spending), 400)
valid.index <- sample(setdiff(1:nrow(tayko.spending), train.index), 350)
test.index  <- setdiff(1:nrow(tayko.spending), c(train.index, valid.index))
train.spending.df <- tayko.spending[train.index, ]
valid.spending.df <- tayko.spending[valid.index, ]
test.spending.df  <- tayko.spending[test.index, ]
sp.lm <- lm(Spending ~ . - Purchase, data = train.spending.df)
sp.lm
sp.lm.back <- step(sp.lm, direction = "backward")
summary(sp.lm.back)
sp.lm.back.predicted <- predict(sp.lm.back, valid.spending.df)
accuracy(sp.lm.back.predicted, valid.spending.df$Spending)
null <- lm(Spending ~ 1, data = train.spending.df)
sp.lm.forward <- step(null, scope = list(lower = null, upper = sp.lm), direction = "forward")
summary(sp.lm.forward)
sp.lm.forward.predicted <- predict(sp.lm.forward, valid.spending.df)
accuracy(sp.lm.forward.predicted, valid.spending.df$Spending)
sp.lm.both <- step(null, scope = list(upper = sp.lm), direction = "both")
sp.lm.both.predicted <- predict(sp.lm.both, valid.spending.df)
summary(sp.lm.both)
accuracy(sp.lm.both.predicted, valid.spending.df$Spending)
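
# Hedged sketch (not in the original): collect the validation RMSE of the three stepwise
# linear models in one small table for easier comparison.
data.frame(model = c("backward", "forward", "both"),
           RMSE = c(accuracy(sp.lm.back.predicted, valid.spending.df$Spending)[, "RMSE"],
                    accuracy(sp.lm.forward.predicted, valid.spending.df$Spending)[, "RMSE"],
                    accuracy(sp.lm.both.predicted, valid.spending.df$Spending)[, "RMSE"]))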
##Regression tree
set.seed(12345)
#note: the regression tree is fit on the full training partition (train.df), which still includes non-purchasers with Spending = 0
fit <- rpart(Spending ~ . - Purchase, data = train.df, cp = 0.01)
printcp(fit)
min.xerror <- fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
min.xerror
#minimum-error tree: RMSE 127.6529
pfit_min <- prune(fit, cp = min.xerror)
prp(pfit_min, type = 1, extra = 1, split.font = 1, varlen = 0,
    digits = -1, main = "regression tree for Spending")
min.pred <- predict(pfit_min, valid.df)
accuracy(min.pred, valid.df$Spending)
#best-pruned tree: RMSE 138.7743
0.43775 + 0.079485  #presumably min xerror + its xstd from printcp(fit), the 1-SE threshold (see the sketch above)
0.05385643          #cp value of the chosen best-pruned tree
pfit_best <- prune(fit, cp = 0.053856)
prp(pfit_best, type = 1, extra = 1, split.font = 1, varlen = 0,
    digits = -1, main = "regression tree for Spending (best-pruned)")
best.pred <- predict(pfit_best, valid.df)
accuracy(best.pred, valid.df$Spending)

#problem 4 (a): score the test set with the chosen logistic model (stepwise both, cutoff 0.385)
set.seed(12345)
test.prob <- predict(log.lm.both, test.df, type = "response")
test.class <- ifelse(test.prob >= 0.385, 1, 0)
confusionMatrix(as.factor(test.class), as.factor(test.df$Purchase))

#problem 4 (b): evaluate the spending model on the test purchasers, then build the score-analysis table
test.pred <- predict(pfit_min, test.spending.df)
accuracy(test.pred, test.spending.df$Spending)
scoreanalysis <- data.frame(test.prob,
                            test.pred = predict(pfit_min, test.df),  #predicted spending for every test customer, so the columns line up
                            test.class)
scoreanalysis$test.pred[test.class == 0] <- 0
#adjust the purchase probability for the oversampled training data; the 0.107 factor is presumably the original 0.053 response rate relative to the 50/50 purchaser mix in the sample
scoreanalysis$adjustedProbability <- test.prob * 0.107
scoreanalysis$expectedSpending <- scoreanalysis$adjustedProbability * scoreanalysis$test.pred
scoreanalysis$expectedSpending[test.class == 0] <- 0
scoreanalysis
library(gains)
gain <- gains(test.df$Spending, scoreanalysis$expectedSpending, groups = 10)
gain

data.frame("depth" = gain["depth"], "obs" = gain["obs"], "cume.obs" = gain["cume.obs"],
           "mean.resp" = gain["mean.resp"], "cume.mean.resp" = gain["cume.mean.resp"], "cume.pct.of.total" = gain["cume.pct.of.total"],
           "lift" = gain["lift"], "cume.lift" = gain["cume.lift"], "mean.prediction" = gain["mean.prediction"])

sort <- data.frame("target" = test.df$Spending, "response" = round(scoreanalysis$expectedSpending, 4))[order(scoreanalysis$expectedSpending, decreasing = TRUE), ]
plot(c(0, gain$cume.pct.of.total*sum(test.df$Spending)) ~ c(0, gain$cume.obs),
     xlab = "# cases", ylab = "Cumulative Spending", main = "", type = "l")
lines(c(0, sum(test.df$Spending)) ~ c(0, dim(test.df)[1]), lty = 2)
sum(test.df$Spending)


#lift-chart conclusion on the test set: inspect the top-ranked customers and plot the decile-wise chart
sort[1:200, ]  #200 highest-scoring test customers by expected spending

#compute deciles and plot decile-wise lift chart
heights <- gain$mean.resp / mean(test.df$Spending)
midpoints <- barplot(heights, names.arg = gain$depth, ylim = c(0, 9),
                     xlab = "Percentile", ylab = "Mean Response", main = "Decile-wise lift chart")
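
# Hedged conclusion sketch (not in the original script): rough gross-profit estimate if the
# 180,000-name pool behaved like the scored test set, reusing the $2-per-catalog mailing
# cost from the problem 1 formula; expectedSpending already folds in the adjusted purchase probability.
est.profit.scored <- 180000 * mean(scoreanalysis$expectedSpending) - 2 * 180000
est.profit.scored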