Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Dependencies: xgboost for gradient-boosted trees, viopoints for the
# strip/violin plot of predicted probabilities.
# library() errors immediately if a package is missing; require() would
# only return FALSE and let the script fail later with a cryptic error.
library(xgboost)
library(viopoints)
### Generate fake data ###

# Number of observations
n <- 1e4

# Generate features: weight, US state abbreviation, age, income.
# Each column is drawn independently here; the association with the
# outcome is injected in the next section.
dat <- data.frame(
  weight = round(rnorm(n, 170, 20), 2),
  state = sample(state.abb, n, replace = TRUE),
  age = round(rnorm(n, 50, 10)),
  income = round(rnorm(n, 70, 20), 3),
  stringsAsFactors = FALSE  # keep state as character, not factor
)
# Generate outcome associated with features --------------------------------
# Four overlapping feature-defined segments; rows falling in any segment
# are made very likely to have outcome = 1. Segments renamed seg_a..seg_d
# (the original bound `c`, shadowing base::c — legal but a known footgun).
seg_a <- (dat$weight < 170 &
            dat$state %in% state.abb[1:20] &
            dat$age > 50 &
            dat$income < 70)
seg_b <- (dat$weight < 180 &
            dat$state %in% state.abb[30:45] &
            dat$age < 30 &
            dat$income > 100)
# NOTE(review): state.abb[1:50] is every state, so this clause is always
# TRUE — presumably intentional (segment defined by the other features).
seg_c <- (dat$weight > 200 &
            dat$state %in% state.abb[1:50] &
            dat$age > 70 &
            dat$income > 50)
seg_d <- (dat$weight > 150 & dat$weight < 190 &
            dat$state %in% state.abb[40:50] &
            dat$age > 40 & dat$age < 70 &
            dat$income > 90)
idx <- which(seg_a | seg_b | seg_c | seg_d)

# Include some noise: ~5% positives everywhere as a base rate, then flip
# 95% of the segment rows to 1.
outcome <- sample(0:1, n, prob = c(.95, .05), replace = TRUE)
outcome[sample(idx, .95 * length(idx))] <- 1
### Fit model ###

# Get train/validation indices: random 70/30 split of the rows.
# seq_len() is safe where 1:nrow(dat) would misbehave on empty input.
idx_train <- sample(seq_len(nrow(dat)), .7 * nrow(dat))
idx_val <- setdiff(seq_len(nrow(dat)), idx_train)

# Format data for xgboost: encode state as its integer factor code so the
# whole feature set fits in a single numeric matrix.
xgb_dat <- dat
xgb_dat$state <- as.numeric(as.factor(xgb_dat$state))
xgb_dat <- as.matrix(xgb_dat)
dtrain <- xgb.DMatrix(data = xgb_dat[idx_train, ], label = outcome[idx_train])
dval <- xgb.DMatrix(data = xgb_dat[idx_val, ], label = outcome[idx_val])
- # Set hyperparameters
- xgb_params = list(max.depth = 3,
- eta = 0.1,
- min_child_weight = 1,
- subsample = 0.5,
- colsample_bytree = 0.5,
- colsample_bylevel = 0.5,
- lambda = 0.1,
- alpha = 0,
- gamma = 0,
- max_delta_step = 0,
- scale_pos_weight = 1/mean(outcome[idx_train]),
- num_parallel_tree = 1)
- # Train model
- bst = xgb.train(data = dtrain,
- params = xgb_params,
- nrounds = 500,
- watchlist = list(train = dtrain, validate = dval),
- base = 0.5,
- objective = "binary:logistic",
- eval_metric = "auc",
- early_stopping_rounds = 100)
### Results ###

# Extract validation-set predictions and threshold at 0.5.
pred_prob <- predict(bst, dval)
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
val_class <- outcome[idx_val]
xgb_imp <- xgb.importance(colnames(dat), bst)

# Confusion-matrix rates.
# BUG FIX: the original swapped the FPR and FNR formulas —
#   FPR = FP / (FP + TN)  conditions on ACTUAL negatives,
#   FNR = FN / (FN + TP)  conditions on ACTUAL positives.
stats <- data.frame(
  TPR = sum(pred_class == 1 & val_class == 1) / sum(val_class == 1),
  FPR = sum(pred_class == 1 & val_class == 0) / sum(val_class == 0),
  TNR = sum(pred_class == 0 & val_class == 0) / sum(val_class == 0),
  FNR = sum(pred_class == 0 & val_class == 1) / sum(val_class == 1)
)

# Plot predicted probability by actual class (annotated with the rates),
# then the feature-importance chart below it.
par(mfrow = c(2, 1))
txt_stats <- paste(names(stats), round(stats, 3), sep = " = ")
viopoints(pred_prob ~ val_class, pch = 21, bg = c("Red", "Blue"),
          xlab = "Actual Class", ylab = "Pr(Class 1)",
          main = c(paste(txt_stats[1:2], collapse = " "),
                   paste(txt_stats[3:4], collapse = " ")))
abline(h = 0.5, lty = 2, lwd = 2); grid()
xgb.plot.importance(xgb_imp, main = "Importance", xlab = "Gain")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement