# Compare logistic regression, random forest, and xgboost decision boundaries
# on simulated data whose true boundary is the line x1 = x2.
library(randomForest)
library(xgboost)

# helper functions: inverse-logit link, binary log loss, classification accuracy
logistic <- function(x) exp(x) / (1 + exp(x))
logloss <- function(target, pred) -mean(ifelse(target > 0, log(pred), log(1 - pred)))
acc <- function(target, pred) sum(target == pred) / length(target)
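# Quick sanity check of the helpers (illustrative values, not part of the
# original script): logistic(0) is 0.5, and logloss(c(1, 0), c(0.9, 0.1))
# is -log(0.9), roughly 0.105.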
set.seed(1829)
n <- 10000
# earlier, more complex data-generating process (kept for reference):
# x1 <- rnorm(n)
# x2 <- runif(n)
# x3 <- rpois(n, 4.5)
# inv_prob <- (-.2 + .3 * x1 + .1 * x2 + -.2 * x3 +                   # main effects
#              x1 * x2 + 1.4 * x2 * x3 + .1 * x1 * x2 * x3 - 3) / 10  # interactions
x1 <- runif(n)
x2 <- runif(n)
inv_prob <- x1 - x2  # linear predictor: the true boundary is the diagonal x1 = x2

y_prob <- logistic(inv_prob)
y <- rbinom(n, 1, y_prob)
train_frac <- .5
cases <- sample(seq_len(n), train_frac * n)
dat <- data.frame(x1, x2, y)
train <- dat[cases, ]
test <- dat[-cases, ]
# x2 value on a fitted logistic-regression decision boundary for a given x1,
# from solving intercept + b1*x1 + b2*x2 = 0 for x2
# (bug fix: the body referenced the global mod_lr instead of the mdl argument)
x2_lr <- function(mdl, x1) {
  m <- coefficients(mdl)
  -(m[['(Intercept)']] + x1 * m[['x1']]) / m[['x2']]
}
mod_lr <- glm(y ~ x1 + x2, binomial, train)
predtr_lr <- predict(mod_lr, train, type = "response")
pred_lr <- predict(mod_lr, test, type = "response")
coefficients(mod_lr)
test$lr_prob <- pred_lr
test$lr_class <- test$lr_prob > .5

acc_lr <- acc(test$y, test$lr_class)
mod_rf <- randomForest(factor(y) ~ ., train)
pred_rf <- predict(mod_rf, test) == 1
test$rf <- pred_rf
acc_rf <- acc(test$y, test$rf)

# accuracy of the oracle rule that uses the true boundary x1 > x2
acc_pop <- acc(test$y, (test$x1 - test$x2) > 0)
print(c(acc_lr, acc_rf, acc_pop))
dtrain <- xgb.DMatrix(as.matrix(train[c('x1', 'x2')]), label = train$y)
dtest <- xgb.DMatrix(as.matrix(test[c('x1', 'x2')]), label = test$y)
xgb_params <- list(
  eta = 0.1,
  subsample = 0.7,
  objective = "binary:logistic",
  max_depth = 5,
  eval_metric = "logloss"  # canonical parameter name (was eval.metric)
)
# cross-validated alternative for picking nrounds:
# cv <- xgb.cv(data = dtrain, nrounds = 50, nthread = 2, nfold = 5,
#              metrics = list("logloss"),
#              early_stopping_rounds = 15,
#              max_depth = 5, eta = 0.05, objective = "binary:logistic")
bst <- xgb.train(xgb_params, data = dtrain, nrounds = 100,
                 watchlist = list(train = dtrain, test = dtest),
                 early_stopping_rounds = 15)
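# With early stopping, the classic xgboost R interface records the winning
# round on the returned handle (an assumption about the package version in
# use here, matching the xgb.DMatrix-style API above); printing it is a
# cheap sanity check:
print(bst$best_iteration)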
predtr_bst <- predict(bst, as.matrix(train[c('x1', 'x2')]))
pred_bst <- predict(bst, as.matrix(test[c('x1', 'x2')]))
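# The logloss() helper and the in-sample predictions above are otherwise
# unused; a minimal sketch of the comparison they suggest (an assumption,
# not in the original): train vs. test log loss for both probability models.
print(c(lr_train  = logloss(train$y, predtr_lr),
        lr_test   = logloss(test$y, pred_lr),
        bst_train = logloss(train$y, predtr_bst),
        bst_test  = logloss(test$y, pred_bst)))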
classes.train <- ifelse(train$x1 > train$x2, "blue", "orange")
classes.test <- ifelse(test$x1 > test$x2, "blue", "orange")
ran <- seq(0, 1, 0.01)
grid <- expand.grid(x1 = ran, x2 = ran)
classes.grid <- predict(mod_rf, grid) == 1
prob_lr.grid <- predict(mod_lr, grid, type = 'response')
contour_levels <- seq(min(prob_lr.grid), max(prob_lr.grid), length = 10)
prob_bst.grid <- predict(bst, as.matrix(grid))
# plot the random-forest decision boundary
contour(x = ran, y = ran, z = matrix(classes.grid, nrow = length(ran)), levels = .5,
        col = "grey", drawlabels = FALSE, lwd = 2)

# probability contours for boosting and logistic regression (each call starts
# a fresh plot; the points and lines below land on the last one)
contour(x = ran, y = ran, z = matrix(prob_bst.grid, nrow = length(ran)), levels = contour_levels,
        col = "grey", drawlabels = TRUE)
contour(x = ran, y = ran, z = matrix(prob_lr.grid, nrow = length(ran)), levels = contour_levels,
        col = "grey", drawlabels = TRUE)
# add points from the test dataset, colored by true class
points(test[c('x1', 'x2')], col = test$y * 3 + 1)
# points(train[c('x1', 'x2')], col = train$y * 3 + 2)
lines(c(0, 1), c(0, 1), col = 10)                # true boundary x1 = x2
lines(c(0, 1), x2_lr(mod_lr, c(0, 1)), col = 5)  # fitted LR boundary
plot(test$x1, test$x2, col = 1 + test$lr_class)
title(paste('logistic regression', paste(mod_lr$coefficients, collapse = ' ')))

plot(test$x1, test$x2, col = 5 + test$y)
lines(c(0, 1), c(0, 1), col = 1)
title('test data')

plot(test$x1, test$x2, col = 3 + test$rf)
lines(c(0, 1), c(0, 1), col = 1)
title('random forest')
# bin x1 and x2 into tenths; cut points at 0, 0.1, ..., 1 so that the
# midpoints vector below gives the bin centers 0.05, ..., 0.95
# (the cut points were originally shifted by +0.05, which left values
# below 0.05 unbinned and out of step with the midpoints)
cuts <- seq(0, 1, .1)
midpoints <- seq(0, .9, .1) + 0.05
test$x1c <- cut(test$x1, cuts)
test$x2c <- cut(test$x2, cuts)
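# The bins above are otherwise unused; a hedged sketch of one plausible use
# (an assumption, not in the original): compare the observed event rate in
# each (x1c, x2c) cell with the true probability logistic(x1 - x2) evaluated
# at the cell midpoints.
obs_rate <- tapply(test$y, list(test$x1c, test$x2c), mean)    # 10 x 10 matrix
true_rate <- outer(midpoints, midpoints, function(a, b) logistic(a - b))
print(round(obs_rate - true_rate, 2))                         # NA for empty cells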