Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# --- Setup: connect to Redshift and pull/clean the churn data -----------------
# Guard the one-time install so the script does not reinstall on every run.
if (!requireNamespace("RPostgreSQL", quietly = TRUE)) {
  install.packages("RPostgreSQL")
}
library("RPostgreSQL")
drv <- dbDriver("PostgreSQL")
# SECURITY FIX: the username/password were hard-coded here (and thereby leaked
# with the script). Read them from the environment instead; set
# REDSHIFT_USER / REDSHIFT_PASSWORD before running.
conn <- dbConnect(
  drv,
  host = "cloud2go-redshift-invoice2go.cvijiumx2xhk.us-west-2.redshift.amazonaws.com",
  port = "5439",
  dbname = "cloud2go",
  user = Sys.getenv("REDSHIFT_USER"),
  password = Sys.getenv("REDSHIFT_PASSWORD")
)
# insert query into variable churn
# churn = dbGetQuery(conn,"select...")
# Convert particular columns to particular types. If you add a column to the
# original query, make sure the column indices below still line up.
# BUG FIX: sapply() simplifies its result to a matrix, silently destroying the
# logical/factor classes it just created; lapply() keeps each column's class.
churn[c(12, 13, 14)] <- lapply(churn[c(12, 13, 14)], as.logical)
churn[c(2, 3, 5, 6, 8, 9, 10, 15, 16, 17, 18, 19)] <-
  lapply(churn[c(2, 3, 5, 6, 8, 9, 10, 15, 16, 17, 18, 19)], as.factor)
# BUG FIX: the old as.matrix()/as.data.frame() round trip coerced every column
# to character, undoing the conversions above. Drop the first (id) column
# directly instead, preserving column types.
churn_df <- churn[, -1]
# Split a data frame into random 75% training / 25% test partitions.
#   dataframe - the data to split
#   seed      - optional RNG seed for a reproducible split
# Returns list(trainSet = ..., testSet = ...).
splitdf <- function(dataframe, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)
  # seq_len() is safe for zero-row inputs, unlike 1:nrow() which yields c(1, 0).
  index <- seq_len(nrow(dataframe))
  trainIndex <- sample(index, trunc(length(index) * 0.75))
  # drop = FALSE keeps single-column inputs as data frames instead of vectors.
  trainSet <- dataframe[trainIndex, , drop = FALSE]
  testSet <- dataframe[-trainIndex, , drop = FALSE]
  list(trainSet = trainSet, testSet = testSet)
}
# Apply the splitting function: 75/25 train/test, seed fixed for reproducibility.
splits <- splitdf(churn_df, seed = 808)
# Sanity-check partition sizes and peek at the first rows of each piece.
lapply(splits, nrow)
lapply(splits, head)
training <- splits$trainSet
testing <- splits$testSet
# Column used as the modeling outcome.
outcome <- 'churned'
# "Positive" class label for that column. Positive is FALSE (i.e. 0) because we
# treat 1's as churned and 0's as retained.
pos <- 'FALSE'
# Cross-tabulate any categorical variable from the dataset against the outcome
# (churn) to see the per-level breakout; NA levels get their own bucket.
table218 <- table(
  categorical_variable = training[, 'doc_recency'],
  churn = training[, outcome],
  useNA = 'ifany'
)
print(table218)
# Share of the second outcome column within each predictor level.
print(table218[, 2] / (table218[, 1] + table218[, 2]))
# Same breakout computed on the held-out testing data.
table218 <- table(
  categorical_variable = testing[, 'doc_recency'],
  churn = testing[, outcome],
  useNA = 'ifany'
)
print(table218)
print(table218[, 2] / (table218[, 1] + table218[, 2]))
# Single-variable prediction: estimate P(outcome == pos | level of varCol) with
# light additive smoothing on the training data, then apply it to appCol.
#   outCol - training outcome vector
#   varCol - training values of the categorical predictor
#   appCol - values to score (training or test column)
#   pos    - label treated as the positive class; default matches the script's
#            global `pos` ('FALSE'), added as an argument so the helper is
#            self-contained and reusable (backward compatible for existing calls)
# Returns a numeric vector of smoothed positive rates for each entry of appCol:
# NA predictor values get the NA-level rate, unseen levels get the base rate.
mkPredC <- function(outCol, varCol, appCol, pos = 'FALSE') {
  # overall base rate of the positive class
  pPos <- sum(outCol == pos) / length(outCol)
  # positive rate among rows where the predictor itself is missing
  naTab <- table(as.factor(outCol[is.na(varCol)]))
  pPosWna <- (naTab / sum(naTab))[pos]
  vTab <- table(as.factor(outCol), varCol)
  # smoothed conditional positive rate per predictor level
  pPosWv <- (vTab[pos, ] + 1.0e-3 * pPos) / (colSums(vTab) + 1.0e-3)
  pred <- pPosWv[appCol]
  pred[is.na(appCol)] <- pPosWna
  pred[is.na(pred)] <- pPos   # level unseen in training -> overall base rate
  pred
}
# BUG FIX: catVars must be a character vector of column *names* -- the loop
# below indexes training[, v]. The original c(training$platform_created_on, ...)
# concatenated the column VALUES, so the loop iterated over data, not variables.
catVars <- c('platform_created_on', 'country', 'month_of_registered',
             'month_of_purchased', 'days_to_first_purchase', 'first_productid',
             'first_purchased_via', 'total_devices', 'clientlist',
             'productlist', 'doc_count', 'doc_recency', 'email_domain',
             'industry', 'invoice_bucket')
# Build a single-variable prediction column ('pred<var>') for every categorical
# variable: fit on training outcomes, score both training and testing.
# Takes a while to run (~30 mins).
for (v in catVars) {
  pi <- paste0('pred', v)
  training[, pi] <- mkPredC(training[, outcome], training[, v], training[, v])
  testing[, pi] <- mkPredC(training[, outcome], training[, v], testing[, v])
}
- #68% of users who have a client list did not churn
- #98.9% of users who created a doc within 30 days of expiring did not churn, 77% of users who created a doc within 60 days of expiring did not churn, 18% of users who created a doc >60 days did not churn
- #50% of users with only 1 device do not churn, 73% of users with 2 devices do not churn, 87% of users with 3 or more devices do not churn
- #clustering and forests
- library(rpart)
- library(party)
- library(randomForest)
- library(rattle)
- library(ggplot2)
# RANDOM FORESTS ---------------------------------------------------------------
# Note: doc_count must be bucketed first -- the forest won't plot a categorical
# predictor with more than 53 levels. Industry is excluded for now.
# (Fixed a stray duplicated '+' before doc_count in the original formula.)
fit_forest <- randomForest(
  churned ~ month_of_registered + month_of_purchased + platform_created_on +
    first_purchased_via + country + days_to_first_purchase + first_productid +
    total_devices + productlist + clientlist + doc_count + doc_recency +
    email_domain + invoice_bucket,
  data = training,
  importance = TRUE,
  keep.forest = TRUE
)
# Per-variable importance; the higher the Gini decrease, the more predictive.
importance(fit_forest)
varImpPlot(fit_forest)
# END RANDOM FORESTS
# TREE BUILDING ----------------------------------------------------------------
# Conditional-inference tree (party package) on the full predictor set.
fit_party <- ctree(
  churned ~ month_of_registered + month_of_purchased + platform_created_on +
    first_purchased_via + country + days_to_first_purchase + first_productid +
    total_devices + productlist + clientlist + doc_count + doc_recency +
    email_domain + invoice_bucket,
  data = training
)
plot(fit_party)
# Simplified trees on the four strongest predictors.
fit_party2 <- ctree(churned ~ doc_recency + doc_count + invoice_bucket +
                      total_devices, data = training)
# BUG FIX: the original referenced undefined `vartesting`; only `testing` is
# defined in this script, so the test-set tree is fit on that.
fit_party3 <- ctree(churned ~ doc_recency + doc_count + invoice_bucket +
                      total_devices, data = testing)
# Write the plot to a file for easier viewing; adjust height/width as needed.
# (Added the missing .png extension to the output filename.)
png("tree13_mar14_may14_w_4_variables_inv_bucket_testing.png",
    res = 80, height = 1800, width = 3600)
plot(fit_party3,
     gp = gpar(fontsize = 4),   # shrink fonts so the wide tree stays legible
     inner_panel = node_inner,
     ip_args = list(
       abbreviate = FALSE,
       id = FALSE
     ))
dev.off()
# END TREE BUILDING
# RPART ------------------------------------------------------------------------
# Classification tree (method = "class") on the full predictor set.
# (Fixed a stray duplicated '+' before doc_count in the original formula.)
fit_rpart <- rpart(
  churned ~ month_of_registered + month_of_purchased + platform_created_on +
    first_purchased_via + country + days_to_first_purchase + first_productid +
    total_devices + productlist + clientlist + doc_count + doc_recency +
    email_domain + invoice_bucket,
  method = "class",
  data = training
)
# Can use prp() (rpart.plot) instead of plot() for nicer output.
plot(fit_rpart, uniform = TRUE)
text(fit_rpart, use.n = TRUE, all = TRUE, cex = .8)
fancyRpartPlot(fit_rpart)
# rpart prediction over the single-variable predictor columns.
library(rpart)
# BUG FIX: the original pasted the outcome twice ('churnedchurned ~ ...'),
# producing an invalid formula. Build '<outcome> ~ v1 + v2 + ...' instead.
# NOTE(review): numericVars and calcAUC are not defined anywhere in this file --
# confirm they are supplied by the surrounding project before this runs.
fV <- paste(outcome, ' ~ ', paste(c(catVars, numericVars), collapse = ' + '),
            sep = '')
tmodel <- rpart(fV, data = training)
print(calcAUC(predict(tmodel, newdata = testing), testing[, outcome]))
# END RPART
# GOWER MATRIX DISTANCES -------------------------------------------------------
# Gower distance handles mixed categorical/numeric columns, which makes it
# suitable for clustering this data -- but it is pairwise over all rows and
# will take a really long time to run on a large training set.
library(cluster)
d <- daisy(training, metric = "gower", stand = FALSE)
# Partition-around-medoids clustering of the distance matrix into 5 clusters.
clusters <- pam(x = d, k = 5, diss = TRUE)
# Complete-linkage hierarchical clustering over the same distances.
hclustfunc <- function(x) hclust(x, method = "complete")
gower <- hclustfunc(d)
# END GOWER
# LOGISTIC REGRESSION ----------------------------------------------------------
# family = binomial because the outcome is TRUE/FALSE; link = "logit" ties the
# response probability to the linear predictor.
model <- glm(
  churned ~ month_of_registered + month_of_purchased + platform_created_on +
    first_purchased_via + country + days_to_first_purchase + first_productid +
    total_devices + productlist + clientlist + doc_count + doc_recency +
    email_domain + invoice_bucket,
  data = training,
  family = binomial(link = "logit")
)
# Score both partitions with predicted churn probabilities.
training$pred <- predict(model, newdata = training, type = "response")
testing$pred <- predict(model, newdata = testing, type = "response")
# Plot score densities by class: we want positive instances to score high and
# negative instances to score low.
library(ggplot2)
ggplot(training, aes(x = pred, color = churned, linetype = churned)) +
  geom_density()
# Coefficients with Pr(>|z|) below 0.05 are strong negative/positive indicators;
# exp(estimate) gives the odds of churn == TRUE relative to retained (churn ==
# FALSE when the estimate is negative), all other variables held equal.
summary(model)
# Log likelihood of predicted probabilities py against 0/1 outcomes y; used
# below to recompute the null and residual deviances (-2 * log likelihood) by
# hand and compare them with the glm summary above.
loglikelihood <- function(y, py) {
  ll_pos <- y * log(py)
  ll_neg <- (1 - y) * log(1 - py)
  sum(ll_pos + ll_neg)
}
# Null-model probability: the overall churn rate in training.
pnull <- mean(as.numeric(training$churned))
# Null deviance by hand; should match model$null.deviance.
null.dev <- -2 * loglikelihood(as.numeric(training$churned), pnull)
model$null.deviance
# Residual deviance by hand; should match model$deviance.
pred <- predict(model, newdata = training, type = "response")
resid.dev <- -2 * loglikelihood(as.numeric(training$churned), pred)
resid.dev
model$deviance
# BUG FIX: testy was computed from training$churned, so its length disagreed
# with testpred (silent vector recycling) and the "test" deviances below were
# meaningless. It must come from the testing partition.
testy <- as.numeric(testing$churned)
testpred <- predict(model, newdata = testing, type = "response")
pnull.test <- mean(testy)
null.dev.test <- -2 * loglikelihood(testy, pnull.test)
resid.dev.test <- -2 * loglikelihood(testy, testpred)
pnull.test
null.dev.test
resid.dev.test
# Significance of the observed fit: chi-squared test on the deviance reduction.
# Degrees of freedom for the null model and the fitted model.
df.null <- dim(training)[[1]] - 1
df.model <- dim(training)[[1]] - length(model$coefficients)
df.null
df.model
deldev <- null.dev - resid.dev
deldf <- df.null - df.model
# Probability of seeing this deviance drop by chance with deldf extra dof.
p <- pchisq(deldev, deldf, lower.tail = FALSE)
deldev
deldf
p
# Explore the logistic model's trade-offs (precision vs. recall) with ROCR.
library(ROCR)
library(grid)
predObj <- prediction(training$pred, training$churned)
precObj <- performance(predObj, measure = "prec")
recObj <- performance(predObj, measure = "rec")
# performance() returns S4 objects; extract the first (only) vector from each slot.
precision <- precObj@y.values[[1]]
prec.x <- precObj@x.values[[1]]
recall <- recObj@y.values[[1]]
# One row per score threshold, with the precision and recall achieved there.
rocFrame <- data.frame(threshold = prec.x,
                       precision = precision,
                       recall = recall)
# Stack a list of plots vertically on a single page using grid viewports.
#   plist - list of plot objects that print() can place into a viewport
# Called for its drawing side effects; returns nothing useful.
nplot <- function(plist) {
  n <- length(plist)
  grid.newpage()
  pushViewport(viewport(layout = grid.layout(n, 1)))
  # Helper: the viewport for grid cell (row x, column y).
  vplayout <- function(x, y) {
    viewport(layout.pos.row = x, layout.pos.col = y)
  }
  # BUG FIX: the original `for (i in 1:n)` iterated c(1, 0) when plist was
  # empty and errored on plist[[1]]; seq_len(n) is empty-safe.
  for (i in seq_len(n)) {
    print(plist[[i]], vp = vplayout(i, 1))
  }
}
# Overall positive rate, used to express precision as lift over the null model.
pnull <- mean(as.numeric(training$churned))
# Plot precision lift and recall against the score threshold, stacked on one
# page; change xlim/ylim based on the characteristic being examined.
p1 <- ggplot(rocFrame, aes(x = threshold)) +
  geom_line(aes(y = precision / pnull)) +
  coord_cartesian(xlim = c(0, 1), ylim = c(0, 2))
p2 <- ggplot(rocFrame, aes(x = threshold)) +
  geom_line(aes(y = recall)) +
  coord_cartesian(xlim = c(0, 1), ylim = c(0, 2))
nplot(list(p1, p2))
# Confusion matrix on the test set at a 0.5 score threshold.
ctab.test <- table(pred = testing$pred > 0.5, churned = testing$churned)
- #NEED TO FIX BELOW CODE, CURRENTLY NOT WORKING
# Log likelihood of predicted probabilities predCol against outcomes outCol,
# where the (global) `pos` label marks the positive class.
# (The misspelled name is kept as-is: the selection loops below call it.)
logLikelyhood <-
  function(outCol, predCol) {
    is_pos <- outCol == pos
    sum(ifelse(is_pos, log(predCol), log(1 - predCol)))
  }
# Single-variable model selection: keep predictor columns whose single-variable
# log likelihood beats the base-rate model by at least minStep (doubled so the
# score is on a deviance-difference scale).
selVars <- c()
minStep <- 5
# Log likelihood of the no-information model: a constant base-rate prediction.
baseRateCheck <- logLikelyhood(training[, outcome],
                               sum(training[, outcome] == pos) / length(training[, outcome]))
# Categorical candidates.
for (v in catVars) {
  pi <- paste('pred', v, sep = '')
  liCheck <- 2 * ((logLikelyhood(training[, outcome], training[, pi]) - baseRateCheck))
  if (liCheck > minStep) {
    print(sprintf("%s,trainingScore: %g",
                  pi, liCheck))
    selVars <- c(selVars, pi)
  }
}
# Numeric candidates; the extra -1 penalizes the added degree of freedom.
# NOTE(review): numericVars is not defined in this file -- confirm the
# surrounding project builds it before this point.
for (v in numericVars) {
  pi <- paste('pred', v, sep = '')
  # BUG FIX (this is why the block was marked "CURRENTLY NOT WORKING"): the
  # score was assigned to lowercase `licheck` but the condition tested
  # `liCheck`, so every iteration reused the stale value from the loop above.
  liCheck <- 2 * ((logLikelyhood(training[, outcome], training[, pi]) - baseRateCheck) - 1)
  if (liCheck >= minStep) {
    print(sprintf("%s, trainingScore: %g",
                  pi, liCheck))
    selVars <- c(selVars, pi)
  }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement