Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
### Preparation
StoreAsAFM <- function(vars, data, afmFile) {
  ### Stores selected columns of a data.frame as an AFM file (the data file
  ### that is passed to the causal effect decision tree).
  # vars: character vector naming the columns of `data` to include
  # data: data.frame holding those columns
  # afmFile: output filename (handed to cat(); an existing file is replaced)
  ### Layout (tab-separated, one line each):
  ###   header:   '.' followed by the row numbers 1..nrow(data)
  ###   variable: 'N:<name>' for numeric columns, 'C:<name>' for character,
  ###             factor and logical columns, followed by the values
  ### Missing values are allowed and are written as 'NA'; empty strings are
  ### written as 'NA' too.
  ### Stops if a variable is absent from `data` or has an unsupported type.
  ### Nothing is written unless every variable could be encoded.
  ### Called for its side effect; returns NULL invisibly.
  rows <- character(length(vars) + 1L)

  # seq_len() keeps the header correct for a zero-row data.frame
  # (1:0 would emit the bogus columns '1' and '0').
  rows[1L] <- paste(c(".", seq_len(nrow(data))), collapse = "\t")

  for (i in seq_along(vars)) {
    v <- vars[[i]]
    feature <- data[[v]]
    if (is.null(feature)) {
      stop(sprintf("%s is not a column of data", v))
    }
    if (is.factor(feature)) {
      feature <- as.character(feature)
    }
    if (is.numeric(feature)) {
      fieldAnnotation <- sprintf("N:%s", v)
    } else if (is.character(feature)) {
      # Guard the mask against NA so only genuine empty strings are replaced.
      feature[!is.na(feature) & feature == ""] <- "NA"
      fieldAnnotation <- sprintf("C:%s", v)
    } else if (is.logical(feature)) {
      fieldAnnotation <- sprintf("C:%s", v)
    } else {
      stop(sprintf("%s takes no type", v))
    }
    print(fieldAnnotation)
    # c() coerces the values to character, so NA becomes the text 'NA'.
    rows[i + 1L] <- paste(c(fieldAnnotation, feature), collapse = "\t")
  }

  # Single write instead of reopening the file in append mode per variable.
  cat(paste0(rows, "\n"), file = afmFile, sep = "")
  invisible(NULL)
}
options(width = 180)
library(readr)

# User profiles: keep users whose join date is earlier than 14 days before
# 2016-02-04 (presumably so each user has a full 14-day purchase window --
# confirm against the data export date).
u <- read_csv("profile_data.csv")
u <- subset(u, joinDateUTC < as.Date("2016-02-04") - 14)

# Daily purchases: keep rows with day <= 14 and total the value per user.
p <- read_csv("purchase_daily.csv")
p <- subset(p, day <= 14)
pp <- tapply(p$value, p$userId, sum)

# Attach the per-user total to every profile; users without a purchase row
# get value 0.
# NOTE(review): as.integer() yields NA for ids outside the 32-bit integer
# range, which would silently turn those users' value into 0 -- confirm that
# userId always fits.
uu <- merge(
  u,
  data.frame(userId = as.integer(names(pp)), value = pp),
  by = "userId",
  all.x = TRUE
)
uu$value[is.na(uu$value)] <- 0
write.csv(uu, "user_value.csv")
# uu <- read.csv("user_value.csv")

# Segments whose name contains "Control" are the control arm; all others are
# variations.
segments <- as.character(unique(uu$segment))
control <- segments[grepl("Control", segments)]
variations <- segments[!grepl("Control", segments)]

# Columns exported to the AFM files: device/profile covariates, the outcome
# (`value`) and the arm label (`variation`).
vars <- c(
  "kifCountry", "manufacturer", "mvnoName", "osVersion", "sdkVersion",
  "totalRamInKB", "listPrice", "primaryStorageTotalInMB", "timeZoneMin",
  "xDpi", "yDpi", "densityDpi", "heightPixels", "widthPixels",
  "value", "variation"
)
uu$variation <- uu$segment

# Random split, roughly 60% training / 40% testing.
# NOTE(review): no seed is set, so the split differs between runs; the CSV
# files written below are the only record of it.
training <- runif(nrow(uu)) < 0.6

# Index explicitly rather than subset(uu, training): subset() would look up
# `training` inside `uu` first and pick up a column of that name if present.
trainSet <- uu[training, , drop = FALSE]
testSet <- uu[!training, , drop = FALSE]

StoreAsAFM(vars, trainSet, "training_multiarm.afm")
StoreAsAFM(vars, testSet, "testing_multiarm.afm")
write.csv(trainSet, "training_multiarm.csv")
write.csv(testSet, "testing_multiarm.csv")
### Cross-validation
library(boot)

uu <- read.csv("testing_multiarm.csv")

# NOTE(review): `leftNode` is not defined in this script. It looks like a
# placeholder for the logical split condition of a tree node (an expression
# over the columns of `uu`) -- supply it before running this section.
uul <- subset(uu, (leftNode))
# Right branch: rows failing the split condition that have a billing country.
# is.na() tests each row; is.null() on a column is a single TRUE/FALSE and
# therefore never filtered individual rows.
uur <- subset(uu, !(leftNode) & !is.na(uu$billingCountry))

# Mean value per arm on each side of the split.
tapply(uul$value, uul$variation, mean)
tapply(uur$value, uur$variation, mean)

# Bootstrap statistic: mean of `value` over the resampled rows.
bootMeanValue <- function(data, indices) {
  mean(data$value[indices])
}

# `variation` holds segment names, so derive the arm mask from the naming
# convention used elsewhere in this file (control segments contain
# "Control"). A logical `variation` column is still accepted as-is.
arm <- uul$variation
inVariation <- if (is.logical(arm)) arm else !grepl("Control", arm)

# which() drops NA entries, matching what subset() does with NA conditions.
bv <- boot(
  data = uul[which(inVariation), , drop = FALSE],
  statistic = bootMeanValue,
  R = 300
)
bc <- boot(
  data = uul[which(!inVariation), , drop = FALSE],
  statistic = bootMeanValue,
  R = 300
)

# Share of bootstrap replicates (paired by index) in which the variation mean
# exceeds the control mean.
mean(bv$t > bc$t)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement