#' Untitled

#' Target Shuffling
#' Author: Taylor Van Anne
#'
#' Note: this is just my interpretation of what target shuffling means.
#' There are a few different ways to actually conduct the shuffling;
#' this script demonstrates a single approach.
#'
#' An alternative to what I did here would be to shuffle the entire
#' target variable before the train/test split. I chose instead to
#' shuffle only the test-set label values (after splitting the label
#' values into train/test).


# load libraries
library(randomForest)
library(ggplot2)

# fix the RNG state up front so every run of the script is reproducible
set.seed(4)

# how many train/test splits (and model fits) the experiment performs
num_iters <- 100

# zero-filled holders for the per-iteration accuracies, sized in advance
# so results can be written into existing slots instead of growing vectors
results <- vector(mode = "numeric", length = num_iters)
results_shuffled <- vector(mode = "numeric", length = num_iters)


# Target-shuffling experiment on the iris data.
#
# Each iteration draws a fresh 70/30 train/test split, fits two random
# forests on the SAME training data, and records:
#   results[i]          - accuracy of the first forest scored against the
#                         real test labels
#   results_shuffled[i] - accuracy of the second forest scored against a
#                         random permutation of those test labels
#
# NOTE: `ytest` is only used by randomForest() to report test-set error; it
# is not training data. Both forests are therefore trained on the real
# `y_train`, and the shuffling only changes what the predictions are scored
# against.

# the features and labels never change between iterations, so separate them
# once up front instead of re-copying iris on every pass
myiris <- iris

# capture labels (a factor), remove label from x-data
myiris_labels <- myiris$Species
myiris$Species <- NULL

# split sizes are loop-invariant as well
n_obs <- nrow(myiris)
n_train <- floor(.7 * n_obs)

# seq_len() (unlike 1:num_iters) gives an empty loop when num_iters is 0
for (i in seq_len(num_iters)) {

    # report out every tenth iteration
    if (i %% 10 == 0) {
        print(paste0("iteration: ", i))
    }

    # identify train/test split: 70% of the rows train, the rest test
    indx_train <- sample(seq_len(n_obs), n_train)
    indx_test <- setdiff(seq_len(n_obs), indx_train)

    # split features (x) into train and test
    myiris_train <- myiris[indx_train, ]
    myiris_test <- myiris[indx_test, ]

    # split labels (y) into train and test, and keep a randomly permuted
    # copy of the test labels to score against
    y_train <- myiris_labels[indx_train]
    y_test <- myiris_labels[indx_test]
    y_test_shuffled <- sample(y_test)

    # fit two forests on the same training data; they differ only in the
    # `ytest` used for test-error reporting (and in RNG state).
    # keep.forest = TRUE retains the trees so predict() can be called below
    # (the forest is dropped by default when xtest is supplied).
    myrf <- randomForest(
        x = myiris_train, y = y_train,
        xtest = myiris_test, ytest = y_test,
        keep.forest = TRUE
    )
    myrf_shuffled <- randomForest(
        x = myiris_train, y = y_train,
        xtest = myiris_test, ytest = y_test_shuffled,
        keep.forest = TRUE
    )

    # predict the test set with each forest, then drop the fitted models
    myrf_preds <- predict(myrf, myiris_test)
    myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test)
    rm(myrf, myrf_shuffled)

    # accuracy = share of test rows whose prediction matches the reference
    # labels (real labels for the first forest, shuffled for the second);
    # an NA comparison counts as a miss because the denominator is the
    # full test-set size
    myrf_accuracy <- sum(myrf_preds == y_test, na.rm = TRUE) / length(y_test)
    myrf_accuracy_shuffled <-
        sum(myrf_preds_shuffled == y_test_shuffled, na.rm = TRUE) /
        length(y_test_shuffled)

    # store the accuracy in the pre-allocated numeric vector space
    results[i] <- myrf_accuracy
    results_shuffled[i] <- myrf_accuracy_shuffled
}


# tag each accuracy vector with the experiment it came from, then stack the
# two labelled frames into one long data frame for plotting
df_results <- data.frame(accuracy = results, type = "y_test")
df_results_shuffled <- data.frame(
    accuracy = results_shuffled,
    type = "y_shuffled"
)
df_all <- rbind(df_results, df_results_shuffled)


# overlay the accuracy densities of the two experiments, one fill per type
ggplot(data = df_all, mapping = aes(x = accuracy, fill = type)) +
    theme_bw(base_size = 16) +
    geom_density(alpha = 0.4) +
    labs(title = "Model vs Target-Shuffled Model")