Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#' Target Shuffling
#' Author: Taylor Van Anne
#'
#' Note: this is just my interpretation of what target shuffling means
#' to me. There are a few different ways to actually conduct the
#' shuffling; this script shows a single approach.
#'
#' A different approach would be to shuffle the entire target variable
#' before the train/test split. Here, only the *test* labels are shuffled
#' (after the labels have been split into train/test). Both random forests
#' are therefore trained on the same, un-shuffled training labels; the
#' shuffled labels are only used when scoring the second forest, which
#' gives a chance-level baseline for the accuracy of the first one.
#'
#' Output: a density plot comparing the accuracy against the true test
#' labels with the accuracy against the shuffled test labels over
#' `num_iters` random 70/30 train/test splits of `iris`.

# load libraries
library(randomForest)
library(ggplot2)

# number of train/test resamples (one pair of models is built per resample)
num_iters <- 100

# pre-allocate the accuracy vectors that the loop fills in
results <- numeric(num_iters)
results_shuffled <- numeric(num_iters)

# setting a random seed for reproducibility
set.seed(4)

# The data do not change between iterations, so prepare them once:
# keep the labels in their own vector and drop them from the feature table.
myiris <- iris
myiris_labels <- myiris$Species
myiris$Species <- NULL

n_obs <- nrow(myiris)
n_train <- floor(0.7 * n_obs)

# begin the loop:
for (i in seq_len(num_iters)) {
  # report out every tenth iteration
  if (i %% 10 == 0) {
    print(paste0("iteration: ", i))
  }

  # identify train/test split (70% train, the remainder is test)
  indx_train <- sample(seq_len(n_obs), n_train)
  indx_test <- setdiff(seq_len(n_obs), indx_train)

  # split features (x) into train and test
  myiris_train <- myiris[indx_train, ]
  myiris_test <- myiris[indx_test, ]

  # split labels (y) into train and test, plus a permuted copy of the test
  # labels; sample(y_test) returns y_test in a random order
  y_train <- myiris_labels[indx_train]
  y_test <- myiris_labels[indx_test]
  y_test_shuffled <- sample(y_test)

  # Fit two forests on the SAME training data. `ytest` is only used by
  # randomForest() to report test-set error; it does not influence the
  # fitted trees, so the second forest never "learns" from the shuffled
  # labels -- the two fits differ only through random number generation.
  # keep.forest = TRUE is required because the forest is discarded by
  # default when `xtest` is supplied, and predict() needs it below.
  myrf <- randomForest(
    x = myiris_train, y = y_train,
    xtest = myiris_test, ytest = y_test,
    keep.forest = TRUE
  )
  myrf_shuffled <- randomForest(
    x = myiris_train, y = y_train,
    xtest = myiris_test, ytest = y_test_shuffled,
    keep.forest = TRUE
  )

  # predict the test set with each forest, then free the models
  myrf_preds <- predict(myrf, myiris_test)
  myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test)
  rm(myrf, myrf_shuffled)

  # accuracy against the true test labels, and against the shuffled ones
  myrf_accuracy <- sum(myrf_preds == y_test, na.rm = TRUE) / length(y_test)
  myrf_accuracy_shuffled <-
    sum(myrf_preds_shuffled == y_test_shuffled, na.rm = TRUE) /
    length(y_test_shuffled)

  # store the accuracy in the pre-allocated numeric vector space
  results[i] <- myrf_accuracy
  results_shuffled[i] <- myrf_accuracy_shuffled
}

# label which results came from which experiment and combine into one data frame
df_results <- data.frame(accuracy = results, type = "y_test")
df_results_shuffled <- data.frame(accuracy = results_shuffled, type = "y_shuffled")
df_all <- rbind(df_results, df_results_shuffled)

# plot the density distribution of each group
ggplot(df_all, aes(x = accuracy, fill = type)) +
  geom_density(alpha = 0.4) +
  theme_bw(base_size = 16) +
  ggtitle("Model vs Target-Shuffled Model")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement