Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(randomForest)
- library(xgboost)
- logistic <- function(x) exp(x) / (1 + exp(x))
- logloss <- function (target, pred) { -mean(ifelse(target > 0, log(pred), log(1-pred)))}
- acc <- function(target,pred){ sum(target==pred)/length(target)}
- set.seed(1829)
- n <- 10000
- # x1 <- rnorm(n)
- # x2 <- runif(n)
- # x3 <- rpois(n, 4.5)
- # inv_prob <- (-.2 + .3 * x1 + .1 * x2 + -.2 * x3 + # main effects
- # x1 * x2 + 1.4 * x2 * x3 + .1 * x1 * x2 * x3 - 3)/10# interactions
- x1 <- runif(n)
- x2 <- runif(n)
- inv_prob <- ( 1 * (x1 - x2))
- y_prob <- logistic( inv_prob)
- y <- rbinom(n, 1, y_prob)
- train_frac <- .5
- cases <- sample(seq_len(n), train_frac * n)
- dat <- data.frame(x1, x2, y)
- train <- dat[cases, ]
- test <- dat[-cases, ]
- x2_lr <- function(mdl, x1){
- m <- coefficients(mod_lr)
- -(m[['(Intercept)']] + x1*m[['x1']])/m[['x2']]
- }
- mod_lr <- glm(y ~ x1 + x2 , binomial, train)
- predtr_lr <- predict(mod_lr, train, type = "response")
- pred_lr <- predict(mod_lr, test, type = "response")
- coefficients(mod_lr)
- test$lr_prob <- pred_lr
- test$lr_class <- test$lr_prob > .5
- acc_lr <- acc(test$y, test$lr_class)
- mod_rf <- randomForest(factor(y) ~ ., train)
- pred_rf <- predict(mod_rf, test) == 1
- test$rf <- pred_rf
- acc_rf <- acc(test$y, test$rf)
- acc_pop <- acc(test$y, (test$x1 - test$x2) > 0)
- print( c(acc_lr, acc_rf, acc_pop))
- dtrain <- xgb.DMatrix(as.matrix(train[c('x1', 'x2')]), label = train$y)
- dtest <- xgb.DMatrix(as.matrix(test[c('x1', 'x2')]), label = test$y)
- xgb_params = list(
- eta = 0.1,
- subsample=0.7,
- objective = "binary:logistic",
- max_depth=5,
- eval.metric = "logloss"
- )
- # cv <- xgb.cv(data = dtrain, nrounds = 50, nthread = 2, nfold = 5,
- # metrics = list("logloss"),
- # early_stopping_rounds=15,
- # max_depth = 5, eta = 0.05, objective = "binary:logistic")
- bst <- xgb.train(xgb_params, data = dtrain, nrounds=100,
- watchlist=list(train=dtrain,test=dtest),
- early_stopping_rounds=15)
- predtr_bst <- predict(bst, as.matrix(train[c('x1', 'x2')]))
- pred_bst <- predict(bst, as.matrix(test[c('x1', 'x2')]))
- classes.train <- ifelse(train$x1>train$x2, "blue", "orange")
- classes.test <- ifelse(test$x1>test$x2, "blue", "orange")
- ran <- seq(0,1,0.01)
- grid <- expand.grid(x1=ran, x2=ran)
- classes.grid <- predict(mod_rf, grid) ==1
- prob_lr.grid <- predict(mod_lr, grid, , type='response')
- contour_levels <- seq(min(prob_lr.grid), max(prob_lr.grid), length=10)
- prob_bst.grid <- predict(bst, as.matrix(grid))
- # plot the boundary
- # contour(x=ran, y=ran, z=matrix(classes.grid, nrow=length(ran)), levels=0.5,
- # col="grey", drawlabels=FALSE, lwd=2)
- contour(x=ran, y=ran, z=matrix(classes.grid, nrow=length(ran)), levels=.5,
- col="grey", drawlabels=FALSE, lwd=2)
- contour(x=ran, y=ran, z=matrix(prob_bst.grid, nrow=length(ran)), levels=contour_levels,
- col="grey", drawlabels=TRUE)
- contour(x=ran, y=ran, z=matrix(prob_lr.grid, nrow=length(ran)), levels=contour_levels,
- col="grey", drawlabels=TRUE)
- # add points from test dataset
- points(test[c('x1','x2')], col=test$y*3 + 1)
- #points(train[c('x1','x2')], col=train$y*3 + 2)
- lines(c(0,1), c(0,1),col=10)
- lines(c(0,1), x2_lr(mod_lr,c(0,1)), col=5)
- plot(test$x1,test$x2, col=1 + test$lr_class)
- title(paste('logistic regression ',paste(mod_lr$coefficients)))
- plot(test$x1,test$x2, col=5 + test$y)
- lines(c(0,1), c(0,1),col=1)
- title('test data')
- plot(test$x1,test$x2, col=3 + test$rf)
- lines(c(0,1), c(0,1),col=1)
- title('random forest')
- cuts <- seq(0,1,.1) +0.05
- midpoints <- seq(0,.9,.1) +0.05
- test$x1c <- cut(test$x1,cuts)
- test$x2c <- cut(test$x2,cuts)
Add Comment
Please, Sign In to add comment