Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# --- Setup: connect to Redshift and pull/clean the churn data -----------------
# Guard the one-time install so the script does not reinstall on every run.
if (!requireNamespace("RPostgreSQL", quietly = TRUE)) {
  install.packages("RPostgreSQL")
}
library("RPostgreSQL")
drv <- dbDriver("PostgreSQL")
# SECURITY FIX: the username/password were hard-coded here (and thereby leaked
# with the script). Read them from the environment instead; set
# REDSHIFT_USER / REDSHIFT_PASSWORD before running.
conn <- dbConnect(
  drv,
  host = "cloud2go-redshift-invoice2go.cvijiumx2xhk.us-west-2.redshift.amazonaws.com",
  port = "5439",
  dbname = "cloud2go",
  user = Sys.getenv("REDSHIFT_USER"),
  password = Sys.getenv("REDSHIFT_PASSWORD")
)
# insert query into variable churn
# churn = dbGetQuery(conn,"select...")
# Convert particular columns to particular types. If you add a column to the
# original query, make sure the column indices below still line up.
# BUG FIX: sapply() simplifies its result to a matrix, silently destroying the
# logical/factor classes it just created; lapply() keeps each column's class.
churn[c(12, 13, 14)] <- lapply(churn[c(12, 13, 14)], as.logical)
churn[c(2, 3, 5, 6, 8, 9, 10, 15, 16, 17, 18, 19)] <-
  lapply(churn[c(2, 3, 5, 6, 8, 9, 10, 15, 16, 17, 18, 19)], as.factor)
# BUG FIX: the old as.matrix()/as.data.frame() round trip coerced every column
# to character, undoing the conversions above. Drop the first (id) column
# directly instead, preserving column types.
churn_df <- churn[, -1]
# Split a data frame into random 75% training / 25% test partitions.
#   dataframe - the data to split
#   seed      - optional RNG seed for a reproducible split
# Returns list(trainSet = ..., testSet = ...).
splitdf <- function(dataframe, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)
  # seq_len() is safe for zero-row inputs, unlike 1:nrow() which yields c(1, 0).
  index <- seq_len(nrow(dataframe))
  trainIndex <- sample(index, trunc(length(index) * 0.75))
  # drop = FALSE keeps single-column inputs as data frames instead of vectors.
  trainSet <- dataframe[trainIndex, , drop = FALSE]
  testSet <- dataframe[-trainIndex, , drop = FALSE]
  list(trainSet = trainSet, testSet = testSet)
}
# Apply the splitting function: 75/25 train/test, seed fixed for reproducibility.
splits <- splitdf(churn_df, seed = 808)
# Sanity-check partition sizes and peek at the first rows of each piece.
lapply(splits, nrow)
lapply(splits, head)
training <- splits$trainSet
testing <- splits$testSet
# Column used as the modeling outcome.
outcome <- 'churned'
# "Positive" class label for that column. Positive is FALSE (i.e. 0) because we
# treat 1's as churned and 0's as retained.
pos <- 'FALSE'
# Cross-tabulate any categorical variable from the dataset against the outcome
# (churn) to see the per-level breakout; NA levels get their own bucket.
table218 <- table(
  categorical_variable = training[, 'doc_recency'],
  churn = training[, outcome],
  useNA = 'ifany'
)
print(table218)
# Share of the second outcome column within each predictor level.
print(table218[, 2] / (table218[, 1] + table218[, 2]))
# Same breakout computed on the held-out testing data.
table218 <- table(
  categorical_variable = testing[, 'doc_recency'],
  churn = testing[, outcome],
  useNA = 'ifany'
)
print(table218)
print(table218[, 2] / (table218[, 1] + table218[, 2]))
# Single-variable prediction: estimate P(outcome == pos | level of varCol) with
# light additive smoothing on the training data, then apply it to appCol.
#   outCol - training outcome vector
#   varCol - training values of the categorical predictor
#   appCol - values to score (training or test column)
#   pos    - label treated as the positive class; default matches the script's
#            global `pos` ('FALSE'), added as an argument so the helper is
#            self-contained and reusable (backward compatible for existing calls)
# Returns a numeric vector of smoothed positive rates for each entry of appCol:
# NA predictor values get the NA-level rate, unseen levels get the base rate.
mkPredC <- function(outCol, varCol, appCol, pos = 'FALSE') {
  # overall base rate of the positive class
  pPos <- sum(outCol == pos) / length(outCol)
  # positive rate among rows where the predictor itself is missing
  naTab <- table(as.factor(outCol[is.na(varCol)]))
  pPosWna <- (naTab / sum(naTab))[pos]
  vTab <- table(as.factor(outCol), varCol)
  # smoothed conditional positive rate per predictor level
  pPosWv <- (vTab[pos, ] + 1.0e-3 * pPos) / (colSums(vTab) + 1.0e-3)
  pred <- pPosWv[appCol]
  pred[is.na(appCol)] <- pPosWna
  pred[is.na(pred)] <- pPos   # level unseen in training -> overall base rate
  pred
}
# BUG FIX: catVars must be a character vector of column *names* -- the loop
# below indexes training[, v]. The original c(training$platform_created_on, ...)
# concatenated the column VALUES, so the loop iterated over data, not variables.
catVars <- c('platform_created_on', 'country', 'month_of_registered',
             'month_of_purchased', 'days_to_first_purchase', 'first_productid',
             'first_purchased_via', 'total_devices', 'clientlist',
             'productlist', 'doc_count', 'doc_recency', 'email_domain',
             'industry', 'invoice_bucket')
# Build a single-variable prediction column ('pred<var>') for every categorical
# variable: fit on training outcomes, score both training and testing.
# Takes a while to run (~30 mins).
for (v in catVars) {
  pi <- paste0('pred', v)
  training[, pi] <- mkPredC(training[, outcome], training[, v], training[, v])
  testing[, pi] <- mkPredC(training[, outcome], training[, v], testing[, v])
}
- #68% of users who have a client list did not churn
- #98.9% of users who created a doc within 30 days of expiring did not churn, 77% of users who created a doc within 60 days of expiring did not churn, 18% of users who created a doc >60 days did not churn
- #50% of users with only 1 device do not churn, 73% of users with 2 devices do not churn, 87% of users with 3 or more devices do not churn
- #clustering and forests
- library(rpart)
- library(party)
- library(randomForest)
- library(rattle)
- library(ggplot2)
# RANDOM FORESTS ---------------------------------------------------------------
# Note: doc_count must be bucketed first -- the forest won't plot a categorical
# predictor with more than 53 levels. Industry is excluded for now.
# (Fixed a stray duplicated '+' before doc_count in the original formula.)
fit_forest <- randomForest(
  churned ~ month_of_registered + month_of_purchased + platform_created_on +
    first_purchased_via + country + days_to_first_purchase + first_productid +
    total_devices + productlist + clientlist + doc_count + doc_recency +
    email_domain + invoice_bucket,
  data = training,
  importance = TRUE,
  keep.forest = TRUE
)
# Per-variable importance; the higher the Gini decrease, the more predictive.
importance(fit_forest)
varImpPlot(fit_forest)
# END RANDOM FORESTS
# TREE BUILDING ----------------------------------------------------------------
# Conditional-inference tree (party package) on the full predictor set.
fit_party <- ctree(
  churned ~ month_of_registered + month_of_purchased + platform_created_on +
    first_purchased_via + country + days_to_first_purchase + first_productid +
    total_devices + productlist + clientlist + doc_count + doc_recency +
    email_domain + invoice_bucket,
  data = training
)
plot(fit_party)
# Simplified trees on the four strongest predictors.
fit_party2 <- ctree(churned ~ doc_recency + doc_count + invoice_bucket +
                      total_devices, data = training)
# BUG FIX: the original referenced undefined `vartesting`; only `testing` is
# defined in this script, so the test-set tree is fit on that.
fit_party3 <- ctree(churned ~ doc_recency + doc_count + invoice_bucket +
                      total_devices, data = testing)
# Write the plot to a file for easier viewing; adjust height/width as needed.
# (Added the missing .png extension to the output filename.)
png("tree13_mar14_may14_w_4_variables_inv_bucket_testing.png",
    res = 80, height = 1800, width = 3600)
plot(fit_party3,
     gp = gpar(fontsize = 4),   # shrink fonts so the wide tree stays legible
     inner_panel = node_inner,
     ip_args = list(
       abbreviate = FALSE,
       id = FALSE
     ))
dev.off()
# END TREE BUILDING
# RPART ------------------------------------------------------------------------
# Classification tree (method = "class") on the full predictor set.
# (Fixed a stray duplicated '+' before doc_count in the original formula.)
fit_rpart <- rpart(
  churned ~ month_of_registered + month_of_purchased + platform_created_on +
    first_purchased_via + country + days_to_first_purchase + first_productid +
    total_devices + productlist + clientlist + doc_count + doc_recency +
    email_domain + invoice_bucket,
  method = "class",
  data = training
)
# Can use prp() (rpart.plot) instead of plot() for nicer output.
plot(fit_rpart, uniform = TRUE)
text(fit_rpart, use.n = TRUE, all = TRUE, cex = .8)
fancyRpartPlot(fit_rpart)
# rpart prediction over the single-variable predictor columns.
library(rpart)
# BUG FIX: the original pasted the outcome twice ('churnedchurned ~ ...'),
# producing an invalid formula. Build '<outcome> ~ v1 + v2 + ...' instead.
# NOTE(review): numericVars and calcAUC are not defined anywhere in this file --
# confirm they are supplied by the surrounding project before this runs.
fV <- paste(outcome, ' ~ ', paste(c(catVars, numericVars), collapse = ' + '),
            sep = '')
tmodel <- rpart(fV, data = training)
print(calcAUC(predict(tmodel, newdata = testing), testing[, outcome]))
# END RPART
# GOWER MATRIX DISTANCES -------------------------------------------------------
# Gower distance handles mixed categorical/numeric columns, which makes it
# suitable for clustering this data -- but it is pairwise over all rows and
# will take a really long time to run on a large training set.
library(cluster)
d <- daisy(training, metric = "gower", stand = FALSE)
# Partition-around-medoids clustering of the distance matrix into 5 clusters.
clusters <- pam(x = d, k = 5, diss = TRUE)
# Complete-linkage hierarchical clustering over the same distances.
hclustfunc <- function(x) hclust(x, method = "complete")
gower <- hclustfunc(d)
# END GOWER
# LOGISTIC REGRESSION ----------------------------------------------------------
# family = binomial because the outcome is TRUE/FALSE; link = "logit" ties the
# response probability to the linear predictor.
model <- glm(
  churned ~ month_of_registered + month_of_purchased + platform_created_on +
    first_purchased_via + country + days_to_first_purchase + first_productid +
    total_devices + productlist + clientlist + doc_count + doc_recency +
    email_domain + invoice_bucket,
  data = training,
  family = binomial(link = "logit")
)
# Score both partitions with predicted churn probabilities.
training$pred <- predict(model, newdata = training, type = "response")
testing$pred <- predict(model, newdata = testing, type = "response")
# Plot score densities by class: we want positive instances to score high and
# negative instances to score low.
library(ggplot2)
ggplot(training, aes(x = pred, color = churned, linetype = churned)) +
  geom_density()
# Coefficients with Pr(>|z|) below 0.05 are strong negative/positive indicators;
# exp(estimate) gives the odds of churn == TRUE relative to retained (churn ==
# FALSE when the estimate is negative), all other variables held equal.
summary(model)
# Log likelihood of predicted probabilities py against 0/1 outcomes y; used
# below to recompute the null and residual deviances (-2 * log likelihood) by
# hand and compare them with the glm summary above.
loglikelihood <- function(y, py) {
  ll_pos <- y * log(py)
  ll_neg <- (1 - y) * log(1 - py)
  sum(ll_pos + ll_neg)
}
# Null-model probability: the overall churn rate in training.
pnull <- mean(as.numeric(training$churned))
# Null deviance by hand; should match model$null.deviance.
null.dev <- -2 * loglikelihood(as.numeric(training$churned), pnull)
model$null.deviance
# Residual deviance by hand; should match model$deviance.
pred <- predict(model, newdata = training, type = "response")
resid.dev <- -2 * loglikelihood(as.numeric(training$churned), pred)
resid.dev
model$deviance
# BUG FIX: testy was computed from training$churned, so its length disagreed
# with testpred (silent vector recycling) and the "test" deviances below were
# meaningless. It must come from the testing partition.
testy <- as.numeric(testing$churned)
testpred <- predict(model, newdata = testing, type = "response")
pnull.test <- mean(testy)
null.dev.test <- -2 * loglikelihood(testy, pnull.test)
resid.dev.test <- -2 * loglikelihood(testy, testpred)
pnull.test
null.dev.test
resid.dev.test
# Significance of the observed fit: chi-squared test on the deviance reduction.
# Degrees of freedom for the null model and the fitted model.
df.null <- dim(training)[[1]] - 1
df.model <- dim(training)[[1]] - length(model$coefficients)
df.null
df.model
deldev <- null.dev - resid.dev
deldf <- df.null - df.model
# Probability of seeing this deviance drop by chance with deldf extra dof.
p <- pchisq(deldev, deldf, lower.tail = FALSE)
deldev
deldf
p
# Explore the logistic model's trade-offs (precision vs. recall) with ROCR.
library(ROCR)
library(grid)
predObj <- prediction(training$pred, training$churned)
precObj <- performance(predObj, measure = "prec")
recObj <- performance(predObj, measure = "rec")
# performance() returns S4 objects; extract the first (only) vector from each slot.
precision <- precObj@y.values[[1]]
prec.x <- precObj@x.values[[1]]
recall <- recObj@y.values[[1]]
# One row per score threshold, with the precision and recall achieved there.
rocFrame <- data.frame(threshold = prec.x,
                       precision = precision,
                       recall = recall)
# Stack a list of plots vertically on a single page using grid viewports.
#   plist - list of plot objects that print() can place into a viewport
# Called for its drawing side effects; returns nothing useful.
nplot <- function(plist) {
  n <- length(plist)
  grid.newpage()
  pushViewport(viewport(layout = grid.layout(n, 1)))
  # Helper: the viewport for grid cell (row x, column y).
  vplayout <- function(x, y) {
    viewport(layout.pos.row = x, layout.pos.col = y)
  }
  # BUG FIX: the original `for (i in 1:n)` iterated c(1, 0) when plist was
  # empty and errored on plist[[1]]; seq_len(n) is empty-safe.
  for (i in seq_len(n)) {
    print(plist[[i]], vp = vplayout(i, 1))
  }
}
# Overall positive rate, used to express precision as lift over the null model.
pnull <- mean(as.numeric(training$churned))
# Plot precision lift and recall against the score threshold, stacked on one
# page; change xlim/ylim based on the characteristic being examined.
p1 <- ggplot(rocFrame, aes(x = threshold)) +
  geom_line(aes(y = precision / pnull)) +
  coord_cartesian(xlim = c(0, 1), ylim = c(0, 2))
p2 <- ggplot(rocFrame, aes(x = threshold)) +
  geom_line(aes(y = recall)) +
  coord_cartesian(xlim = c(0, 1), ylim = c(0, 2))
nplot(list(p1, p2))
# Confusion matrix on the test set at a 0.5 score threshold.
ctab.test <- table(pred = testing$pred > 0.5, churned = testing$churned)
- #NEED TO FIX BELOW CODE, CURRENTLY NOT WORKING
# Log likelihood of predicted probabilities predCol against outcomes outCol,
# where the (global) `pos` label marks the positive class.
# (The misspelled name is kept as-is: the selection loops below call it.)
logLikelyhood <-
  function(outCol, predCol) {
    is_pos <- outCol == pos
    sum(ifelse(is_pos, log(predCol), log(1 - predCol)))
  }
# Single-variable model selection: keep predictor columns whose single-variable
# log likelihood beats the base-rate model by at least minStep (doubled so the
# score is on a deviance-difference scale).
selVars <- c()
minStep <- 5
# Log likelihood of the no-information model: a constant base-rate prediction.
baseRateCheck <- logLikelyhood(training[, outcome],
                               sum(training[, outcome] == pos) / length(training[, outcome]))
# Categorical candidates.
for (v in catVars) {
  pi <- paste('pred', v, sep = '')
  liCheck <- 2 * ((logLikelyhood(training[, outcome], training[, pi]) - baseRateCheck))
  if (liCheck > minStep) {
    print(sprintf("%s,trainingScore: %g",
                  pi, liCheck))
    selVars <- c(selVars, pi)
  }
}
# Numeric candidates; the extra -1 penalizes the added degree of freedom.
# NOTE(review): numericVars is not defined in this file -- confirm the
# surrounding project builds it before this point.
for (v in numericVars) {
  pi <- paste('pred', v, sep = '')
  # BUG FIX (this is why the block was marked "CURRENTLY NOT WORKING"): the
  # score was assigned to lowercase `licheck` but the condition tested
  # `liCheck`, so every iteration reused the stale value from the loop above.
  liCheck <- 2 * ((logLikelyhood(training[, outcome], training[, pi]) - baseRateCheck) - 1)
  if (liCheck >= minStep) {
    print(sprintf("%s, trainingScore: %g",
                  pi, liCheck))
    selVars <- c(selVars, pi)
  }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement