Advertisement
Guest User

Untitled

a guest
Mar 11th, 2016
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.97 KB | None | 0 0
  1. install.packages("RPostgreSQL")
  2. library("RPostgreSQL")
  3. drv = dbDriver("PostgreSQL")
  4. conn = dbConnect(drv,host="cloud2go-redshift-invoice2go.cvijiumx2xhk.us-west-2.redshift.amazonaws.com",port="5439",dbname="cloud2go",user="vishal",password="4L23BYpiajNbR9Dn8m48TdV3")
  5.  
  6. #insert query into variable churn
  7. #churn = dbGetQuery(conn,"select...")
  8.  
  9. #this set converts particular columns to a particular type
  10. #if you add a column to the original query, make sure to check that the below code adjusts the right columns
  11. churn[c(12,13,14)] = sapply(churn[c(12,13,14)],as.logical)
  12. churn[c(2,3,5,6,8,9,10,15,16,17,18,19)] = sapply(churn[c(2,3,5,6,8,9,10,15,16,17,18,19)],as.factor)
  13. churn_matrix = as.matrix(churn)
  14. churn_df = as.data.frame(churn_matrix[,-1])
  15.  
  16. #data splitting function for 75% training vs 25% test data
  17. splitdf = function(dataframe,seed=NULL) {
  18. if (!is.null(seed)) set.seed(seed)
  19. index = 1:nrow(dataframe)
  20. trainIndex = sample(index,trunc(length(index)*.75))
  21. trainSet = dataframe[trainIndex, ]
  22. testSet = dataframe[-trainIndex, ]
  23. list(trainSet=trainSet,testSet=testSet)
  24. }
  25. #apply function to split dataframe
  26. splits <- splitdf(churn_df, seed=808)
  27. lapply(splits, nrow)
  28. lapply(splits,head)
  29. training = splits$trainSet
  30. testing = splits$testSet
  31.  
  32. #which column do we want as the outcome
  33. outcome = 'churned'
  34. #what is the positive outcome of this column
  35. #positive will be 0 since we're looking at 1's as churned vs. 0's as retained
  36. pos = 'FALSE'
  37.  
  38. #enter in any categorical variable from the dataset to see the breakout to the outcome variable (churn)
  39. table218 = table(
  40. categorical_variable=training[,'doc_recency'],
  41. churn = training[,outcome],
  42. useNA= 'ifany')
  43. print(table218)
  44.  
  45. print(table218[,2]/(table218[,1]+table218[,2]))
  46.  
  47. #testing data for above
  48. table218 = table(
  49. categorical_variable=testing[,'doc_recency'],
  50. churn = testing[,outcome],
  51. useNA= 'ifany')
  52. print(table218)
  53.  
  54. print(table218[,2]/(table218[,1]+table218[,2]))
  55.  
  56.  
  57.  
  58. #function to predict a variable as the outcome of churn
  59. mkPredC = function(outCol,varCol,appCol) {
  60. pPos = sum(outCol==pos)/length(outCol)
  61. naTab = table(as.factor(outCol[is.na(varCol)]))
  62. pPosWna = (naTab/sum(naTab))[pos]
  63. vTab = table(as.factor(outCol),varCol)
  64. pPosWv = (vTab[pos, ]+1.0e-3*pPos)/(colSums(vTab)+1.0e-3)
  65. pred = pPosWv[appCol]
  66. pred[is.na(appCol)] = pPosWna
  67. pred[is.na(pred)] = pPos
  68. pred
  69. }
  70.  
  71. #set the categorical variables
  72. catVars = c(training$platform_created_on,training$country,training$month_of_registered,training$month_of_purchased,training$days_to_first_purchase,training$first_productid,training$first_purchased_via,training$total_devices,training$clientlist,training$productlist,training$doc_count,training$doc_recency,training$email_domain,training$industry,training$invoice_bucket)
  73.  
  74. #looping to do multiple categorical variables at once
  75. #takes a while to run ~30 mins
  76. for(v in catVars) {
  77. pi = paste('pred',v,sep='')
  78. training[,pi] = mkPredC(training[,outcome],training[,v],training[,v])
  79. testing[,pi] = mkPredC(training[,outcome],training[,v],testing[,v])
  80. }
  81.  
  82. #68% of users who have a client list did not churn
  83. #98.9% of users who created a doc within 30 days of expiring did not churn, 77% of users who created a doc within 60 days of expiring did not churn, 18% of users who created a doc >60 days did not churn
  84. #50% of users with only 1 device do not churn, 73% of users with 2 devices do not churn, 87% of users with 3 or more devices do not churn
  85.  
  86.  
  87. #clustering and forests
  88. library(rpart)
  89. library(party)
  90. library(randomForest)
  91. library(rattle)
  92. library(ggplot2)
  93.  
  94. #RANDOM FORESTS
  95. ##need to bucket doc_count as forest wont plot more than 53 buckets
  96. #took out industry from random forest for now
  97. fit_forest = randomForest(churned ~ month_of_registered + month_of_purchased + platform_created_on + first_purchased_via + country + days_to_first_purchase + first_productid + total_devices + productlist + clientlist + +doc_count + doc_recency + email_domain + invoice_bucket, data = training, importance = TRUE, keep.forest=TRUE)
  98. importance(fit_forest)
  99. varImpPlot(fit_forest)
  100. #generates importance of each variable, higher gini the more predictive
  101.  
  102. #END RANDOM FORESTS
  103.  
  104. #TREE BUILDING
  105. #using party package
  106. fit_party = ctree(churned ~ month_of_registered + month_of_purchased + platform_created_on + first_purchased_via + country + days_to_first_purchase + first_productid + total_devices + productlist + clientlist + +doc_count + doc_recency + email_domain + invoice_bucket, data = training)
  107. plot(fit_party)
  108. #simplify above ctree
  109. fit_party2 = ctree(churned ~ doc_recency + doc_count + invoice_bucket + total_devices, data = training)
  110. fit_party3 = ctree(churned ~ doc_recency + doc_count + invoice_bucket + total_devices, data = vartesting)
  111.  
  112.  
  113.  
  114. #write a plot to a file for easier viewing, can adjust height and width
  115. png("tree13_mar14_may14_w_4_variables_inv_bucket_testing", res=80, height=1800, width=3600)
  116. plot(fit_party3, gp = gpar(fontsize = 4), # font size changed to 6
  117. inner_panel=node_inner,
  118. ip_args=list(
  119. abbreviate = FALSE,
  120. id = FALSE)
  121. )
  122. dev.off()
  123.  
  124. #END TREE BUILDING
  125.  
  126.  
  127. #RPART
  128. fit_rpart = rpart(churned ~ month_of_registered + month_of_purchased + platform_created_on + first_purchased_via + country + days_to_first_purchase + first_productid + total_devices + productlist + clientlist + +doc_count + doc_recency + email_domain + invoice_bucket, method = "class",data =training)
  129. #can use prp instead of plot
  130. plot(fit_rpart,uniform = TRUE)
  131. text(fit_rpart,use.n=TRUE,all=TRUE,cex=.8)
  132. fancyRpartPlot(fit_rpart)
  133.  
  134. #rpart prediction
  135. library(rpart)
  136. fV = paste(outcome,'churned ~ ',
  137. paste(c(catVars,numericVars),collapse = ' + '),sep='')
  138. tmodel = rpart(fV,data=training)
  139. print(calcAUC(predict(tmodel,newdata=testing),testing[,outcome]))
  140.  
  141. #END RPART
  142.  
  143. #GOWER MATRIX DISTANCES
  144.  
  145. #gower distance of categorical variables for clustering
  146. #GOWER distance will take a really long time to run
  147. library(cluster)
  148. d= daisy(training,metric= "gower",stand=FALSE)
  149. #use gower distance to create 5 clusters
  150. clusters = pam(x=d,k=5,diss=TRUE)
  151. hclustfunc <- function(x) hclust(x, method="complete")
  152. gower = hclustfunc(d)
  153.  
  154. #END GOWER
  155.  
  156. #LOGISTIC REGRESSION
  157.  
  158. #set the logistic model
  159. #make sure family is binomial since our outcome is T/F
  160. #link = logit links the output to a linear model
  161. model = glm(churned ~ month_of_registered + month_of_purchased + platform_created_on + first_purchased_via + country + days_to_first_purchase + first_productid + total_devices + productlist + clientlist + doc_count + doc_recency + email_domain + invoice_bucket, data =training,family = binomial(link="logit"))
  162. training$pred = predict(model,newdata=training, type ="response")
  163. testing$pred = predict(model,newdata=testing,type ="response")
  164. #now we need to plot the scores of the negative and positive instances. we want high scores for positive instances and low scores otherwise
  165. library(ggplot2)
  166. ggplot(training, aes(x=pred, color= churned, linetype = churned)) +geom_density()
  167. #look for estimates where Pr(>z) is less than 0.05. these are strong negative/positive indicators
  168. #exp(estimate) will be the odds of churn =TRUE (if estimate is negative, churn = FALSE if estimate is positive) compared to retained with all other variables equal
  169. summary(model)
  170.  
  171. #calculating deviance from summary above
  172. loglikelihood = function(y,py) {
  173. sum(y * log(py) + (1-y) * log(1-py))
  174. }
  175.  
  176. pnull = mean(as.numeric(training$churned))
  177. null.dev =-2*loglikelihood(as.numeric(training$churned),pnull)
  178. model$null.deviance
  179. pred = predict(model,newdata = training, type = "response")
  180. resid.dev = -2*loglikelihood(as.numeric(training$churned),pred)
  181. resid.dev
  182. model$deviance
  183. testy = as.numeric(training$churned)
  184. testpred = predict(model,newdata=testing, type = "response")
  185. pnull.test = mean(testy)
  186. null.dev.test = -2*loglikelihood(testy, pnull.test)
  187. resid.dev.test = -2*loglikelihood(testy,testpred)
  188. pnull.test
  189. null.dev.test
  190. resid.dev.test
  191.  
  192. #calculate significance of observed fit
  193. #degrees of freedom
  194. df.null = dim(training)[[1]]-1
  195. df.model = dim(training)[[1]] - length(model$coefficients)
  196. df.null
  197. df.model
  198. deldev = null.dev - resid.dev
  199. deldf = df.null-df.model
  200. p = pchisq(deldev,deldf,lower.tail = F)
  201. deldev
  202. deldf
  203. p
  204.  
  205. #exploring log reg model trade offs (precision vs. recall)
  206. library(ROCR)
  207. library(grid)
  208. predObj = prediction(training$pred,training$churned)
  209. precObj = performance(predObj,measure = "prec")
  210. recObj = performance(predObj,measure = "rec")
  211. precision = (precObj@y.values)[[1]]
  212. prec.x = (precObj@x.values)[[1]]
  213. recall = (recObj@y.values)[[1]]
  214.  
  215. rocFrame = data.frame(threshold=prec.x, precision = precision, recall = recall)
  216. #function to plot multiple plots on 1 page (stacked)
  217. nplot = function(plist) {
  218. n= length(plist)
  219. grid.newpage()
  220. pushViewport(viewport(layout = grid.layout(n,1)))
  221. vplayout = function(x,y) {viewport(layout.pos.row = x, layout.pos.col = y)}
  222. for(i in 1:n) {
  223. print(plist[[i]], vp=vplayout(i,1))
  224. }
  225. }
  226.  
  227.  
  228. pnull = mean(as.numeric(training$churned))
  229. #change xlim ylim based on characteristic
  230. #plots recall vs. precision
  231. p1 = ggplot(rocFrame, aes(x=threshold)) + geom_line(aes(y=precision/pnull)) + coord_cartesian(xlim = c(0,1),ylim=c(0,2) )
  232. p2 = ggplot(rocFrame, aes(x=threshold))+geom_line(aes(y=recall)) + coord_cartesian(xlim = c(0,1),ylim=c(0,2) )
  233. nplot(list(p1,p2))
  234.  
  235. ctab.test = table(pred=testing$pred >0.5, churned = testing$churned)
  236.  
  237.  
  238.  
  239.  
  240.  
  241.  
  242. #NEED TO FIX BELOW CODE, CURRENTLY NOT WORKING
  243.  
  244. logLikelyhood <-
  245. function(outCol,predCol) {
  246. sum(ifelse(outCol==pos,log(predCol),log(1-predCol)))
  247. }
  248. selVars <- c()
  249. minStep <- 5
  250. baseRateCheck <- logLikelyhood(training[,outcome],
  251. sum(training[,outcome]==pos)/length(training[,outcome]))
  252.  
  253. for(v in catVars) {
  254. pi<- paste('pred',v,sep='')
  255. liCheck <- 2*((logLikelyhood(training[,outcome],training[,pi]) - baseRateCheck))
  256. if(liCheck>minStep) {
  257. print(sprintf("%s,trainingScore: %g",
  258. pi,liCheck))
  259. selVars<- c(selVars,pi)
  260. }
  261. }
  262.  
  263. for(v in numericVars) {
  264. pi<- paste('pred',v,sep='')
  265. licheck <- 2*((logLikelyhood(training[,outcome],training[,pi])-baseRateCheck)-1)
  266. if(liCheck>=minStep) {
  267. print(sprintf("%s, trainingScore: %g",
  268. pi,liCheck))
  269. selVars<- c(selVars,pi)
  270. }
  271. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement