Advertisement
Guest User

Untitled

a guest
Feb 8th, 2017
134
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.05 KB | None | 0 0
  1. library(RJDBC)
  2. library(reshape)
  3. library(dplyr)
  4. library(Hmisc)
  5. library(party)
  6. library(MASS)
  7. library(Boruta)
  8. library(randomForest)
  9. library(caret)
  10. library(car)
  11. library(AUC)
  12. library(party)
  13. library(Information)
  14.  
  15.  
  # ---- Connection settings -------------------------------------------------
  # The global environment is deliberately left untouched: a script that
  # removes the caller's objects on start destroys unrelated session state.
  # Every object used further down is assigned explicitly.

  # Credentials are read from the environment when set, so real secrets never
  # have to be written into the script. The fallbacks are the placeholder
  # values this script has always carried.
  user <- Sys.getenv("MADKUDU_DB_USER", unset = "hac")
  pwd <- Sys.getenv("MADKUDU_DB_PASSWORD", unset = "pwd")

  # Tenant id (used to build the schema name) and cluster host prefix.
  tenant <- 3625
  instance <- "madkudu-prod-3"

  # Location of the PostgreSQL JDBC driver jar; overridable per machine.
  jdbc_jar <- Sys.getenv(
    "MADKUDU_JDBC_JAR",
    unset = "/Users/hacphan/lib/postgresql-9.4.1212.jar"
  )
  pgsql <- JDBC("org.postgresql.Driver", jdbc_jar, "`")

  # JDBC URL without credentials. User and password are handed to dbConnect()
  # as separate arguments, so characters such as "&" or "=" in a password
  # cannot corrupt the URL query string and the secret is not kept in `con`.
  con <- paste0(
    "jdbc:postgresql://", instance,
    ".cqkkyeyuyqzv.us-east-1.redshift.amazonaws.com:5439/madkudu",
    "?ssl=true&sslfactory=org.postgresql.ssl.NonValidatingFactory"
  )
  madkudu_db <- dbConnect(pgsql, con, user = user, password = pwd)
  31.  
  # ---- Pull the training cohort --------------------------------------------
  # Schema-qualified training table for the configured tenant.
  sql_table_name <- paste0("mk_", tenant, "_1.prototype_cfit_training")

  # Upper bound on the number of target = 0 rows drawn at random.
  # Kept as an integer so it is never rendered in scientific notation.
  max_negative_rows <- 60000L

  # Every target = 1 row ...
  positive_rows <- paste0(
    "(SELECT target AS has_converted, * FROM ", sql_table_name,
    " WHERE target = 1)"
  )
  # ... plus a random sample of at most `max_negative_rows` target = 0 rows.
  negative_rows <- paste0(
    "(SELECT target AS has_converted, * FROM ", sql_table_name,
    " WHERE target = 0 ORDER BY RANDOM() LIMIT ", max_negative_rows, ")"
  )
  # The derived table carries an alias because the FROM-clause grammar requires
  # one for a subquery. The outer ORDER BY RANDOM() shuffles the combined rows.
  cohort_query <- paste0(
    "SELECT * FROM (", positive_rows, " UNION ALL ", negative_rows,
    ") AS sampled_cohort ORDER BY RANDOM();"
  )

  # Fetch the cohort and release the connection even if the query fails.
  cohort_data_c <- tryCatch(
    dbGetQuery(madkudu_db, cohort_query),
    finally = dbDisconnect(madkudu_db)
  )

  # Modelling frame: the row id and the raw target column are removed;
  # `has_converted` (a copy of target made in the query) is the response.
  data_for_fitting_c <- dplyr::select(cohort_data_c, -id, -target)
  36.  
  # ---- Column pruning and train/test split ---------------------------------
  # Remove columns that are NA in every row. drop = FALSE keeps a data frame
  # even when a single column survives.
  all_na <- colSums(is.na(data_for_fitting_c)) == nrow(data_for_fitting_c)
  dfNa_c <- data_for_fitting_c[, !all_na, drop = FALSE]

  # Remove columns that never take a non-zero value. na.rm = TRUE is required:
  # without it a single missing value turns that column's count into NA, and
  # an NA in the logical column selector breaks the subset.
  has_nonzero <- colSums(dfNa_c != 0, na.rm = TRUE) > 0
  df_c <- dfNa_c[, has_nonzero, drop = FALSE]

  # 60% of the rows go to training, the remainder to testing.
  n_rows <- nrow(df_c)
  smp_size <- floor(0.60 * n_rows)

  ## set the seed to make the partition reproducible
  set.seed(12345)
  train_ind <- sample(seq_len(n_rows), size = smp_size)

  # training and testing data sets
  # The test rows are selected by positive index: negative indexing with an
  # empty `train_ind` would return zero rows instead of all of them.
  test_ind <- setdiff(seq_len(n_rows), train_ind)
  train <- df_c[train_ind, , drop = FALSE]
  test <- df_c[test_ind, , drop = FALSE]
  48.  
  # ---- Model fitting -------------------------------------------------------
  # Logistic regression: has_converted against every other column of `train`.
  glm_fit <- glm(
    formula = has_converted ~ .,
    family = binomial,
    data = train
  )

  # Conditional inference tree restricted to five mk_* predictors, with the
  # minsplit control set to 250.
  tree_fit <- ctree(
    has_converted ~ mk_employee + mk_funding + mk_alexa + mk_marketcap +
      mk_tech_count,
    data = train,
    controls = ctree_control(minsplit = 250)
  )

  # ---- Evaluation on the held-out rows -------------------------------------
  # NOTE(review): see_results() is not defined in this script; it is expected
  # to already exist in the session. The meaning of the third argument (100)
  # should be confirmed against its definition.
  see_results(glm_fit, test, 100)
  see_results(tree_fit, test, 100)
  59.  
  # ---- Information tables --------------------------------------------------
  # Built on the raw query result with has_converted as the dependent variable.
  # NOTE(review): cohort_data_c still contains `id` and `target` (the column
  # has_converted was copied from) -- confirm they are meant to be included.
  IV <- create_infotables(
    data = cohort_data_c,
    y = "has_converted",
    NULL
  )

  # ---- Score distribution --------------------------------------------------
  # Attach each model's prediction for the test rows as a new column.
  test$score_reg <- predict(
    glm_fit,
    newdata = test,
    type = "response"
  )
  test$score_tree <- predict(
    tree_fit,
    newdata = test,
    type = "response"
  )

  # Histogram of the logistic-regression scores.
  hist(test$score_reg)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement