Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(RJDBC)
- library(reshape)
- library(dplyr)
- library(Hmisc)
- library(party)
- library(MASS)
- library(Boruta)
- library(randomForest)
- library(caret)
- library(car)
- library(AUC)
- library(party)
- library(Information)
# Connect to the Redshift warehouse and pull the training cohort:
# every converted row (target = 1) plus a random sample of at most 60000
# non-converted rows (target = 0), returned in random order.
#
# NOTE: the former `rm(list = setdiff(ls(), lsf.str()))` workspace wipe was
# removed. Every variable used below is (re)assigned by this script, so the
# wipe only destroyed unrelated objects in the caller's session. Run the script
# in a fresh R session if a clean workspace is wanted.

# Credentials are read from the environment when available. The fallbacks are
# the values that used to be hard-coded, so existing setups keep working.
# NOTE(review): the environment variable names are new; adjust to local
# conventions and drop the fallbacks once the variables are provisioned.
user <- Sys.getenv("MADKUDU_DB_USER", unset = "hac")
pwd <- Sys.getenv("MADKUDU_DB_PASSWORD", unset = "pwd")
tenant <- 3625
instance <- "madkudu-prod-3"

pgsql <- JDBC(
  "org.postgresql.Driver",
  "/Users/hacphan/lib/postgresql-9.4.1212.jar",
  "`"
)

# User and password are percent-encoded so that reserved characters such as
# `&`, `=` or `%` cannot corrupt the query string of the JDBC URL.
# NOTE(review): `NonValidatingFactory` appears to skip server certificate
# validation -- confirm this is acceptable for a production instance.
con <- paste0(
  "jdbc:postgresql://", instance,
  ".cqkkyeyuyqzv.us-east-1.redshift.amazonaws.com:5439/madkudu?user=",
  utils::URLencode(user, reserved = TRUE),
  "&password=", utils::URLencode(pwd, reserved = TRUE),
  "&ssl=true&sslfactory=org.postgresql.ssl.NonValidatingFactory"
)
madkudu_db <- dbConnect(pgsql, con)

sql_table_name <- paste0("mk_", tenant, "_1.prototype_cfit_training")
cohort_query <- paste0(
  "SELECT * FROM (",
  "(SELECT target AS has_converted, * FROM ", sql_table_name,
  " WHERE target = 1)",
  " UNION ALL ",
  "(SELECT target AS has_converted, * FROM ", sql_table_name,
  " WHERE target = 0 ORDER BY RANDOM() LIMIT 60000)",
  ") ORDER BY RANDOM();"
)

# The connection is only needed for this single query; release it even when
# the query fails so the session does not leak an open JDBC connection.
cohort_data_c <- tryCatch(
  dbGetQuery(madkudu_db, cohort_query),
  finally = dbDisconnect(madkudu_db)
)
# Build the modelling frame from the raw cohort and split it 60/40 into
# training and testing sets.

# Drop the row identifier and `target`; the outcome is kept as `has_converted`.
data_for_fitting_c <- dplyr::select(cohort_data_c, -id, -target)

# Remove columns that are entirely NA. `drop = FALSE` keeps a data frame even
# if only one column survives.
not_all_na <- colSums(is.na(data_for_fitting_c)) < nrow(data_for_fitting_c)
dfNa_c <- data_for_fitting_c[, not_all_na, drop = FALSE]

# Remove columns with no non-zero value. `na.rm = TRUE` is required: without
# it any column holding a single NA yields an NA count, and an NA in a logical
# column index makes `[.data.frame` fail with "undefined columns selected".
has_nonzero <- colSums(dfNa_c != 0, na.rm = TRUE) > 0
df_c <- dfNa_c[, has_nonzero, drop = FALSE]

smp_size <- floor(0.60 * nrow(df_c))

# Set the seed to make the partition reproducible (for a given set of rows;
# the SQL sampling upstream is itself random).
set.seed(12345)
train_ind <- sample(seq_len(nrow(df_c)), size = smp_size)

# Training and testing data sets. The test rows are selected positively
# because `df_c[-train_ind, ]` returns zero rows when `train_ind` is empty.
train <- df_c[train_ind, , drop = FALSE]
test <- df_c[setdiff(seq_len(nrow(df_c)), train_ind), , drop = FALSE]
# Fit two models on the training set, evaluate them on the test set, compute
# information values, and inspect the distribution of regression scores.

# Logistic regression of the outcome on every remaining column.
glm_fit <- glm(has_converted ~ ., data = train, family = binomial)

# Conditional inference tree on the firmographic predictors only; a node needs
# at least 250 observations before a split is attempted.
tree_fit <- ctree(
  has_converted ~ mk_employee + mk_funding + mk_alexa + mk_marketcap +
    mk_tech_count,
  data = train,
  controls = ctree_control(minsplit = 250)
)

# See results.
# NOTE(review): `see_results` is not defined in this file; it is presumably
# a helper already present in the session -- confirm before running.
see_results(glm_fit, test, 100)
see_results(tree_fit, test, 100)

# Information values per predictor. `data_for_fitting_c` is used instead of
# the raw `cohort_data_c` because the raw frame still contains `target` (an
# exact copy of the outcome, which would dominate the table) and the row `id`.
IV <- create_infotables(
  data = data_for_fitting_c,
  y = "has_converted",
  valid = NULL
)

# Look at the distribution of scores. The columns are added only after
# `see_results` has run, so the evaluation above sees the unmodified test set.
test$score_reg <- predict(glm_fit, newdata = test, type = "response")
test$score_tree <- predict(tree_fit, newdata = test, type = "response")
hist(test$score_reg)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement