Advertisement
Guest User

Untitled

a guest
Aug 22nd, 2017
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.01 KB | None | 0 0
  #' Target Shuffling
  #' Author: Taylor Van Anne
  #'
  #' Note: this is just my interpretation of what target shuffling means
  #' to me. There are a few different ways to conduct the shuffling; this
  #' script implements a single approach.
  #'
  #' A different approach would be to shuffle the entire target variable
  #' before the train/test split. Here, only the test labels are shuffled
  #' (after the labels have been split into train/test).
  #'
  #' What the script does: for `num_iters` random 70/30 splits of `iris`,
  #' two random forests are fitted on the same, real training labels. The
  #' first is scored against the real test labels, the second against a
  #' random permutation of the test labels. The two accuracy distributions
  #' are then plotted as overlaid densities.

  # load libraries
  library(randomForest)
  library(ggplot2)

  # number of model-building iterations
  num_iters <- 100

  # pre-allocate numeric vectors to store the accuracies as we loop
  results <- numeric(num_iters)
  results_shuffled <- numeric(num_iters)

  # set a random seed for reproducibility
  set.seed(4)

  # Loop-invariant setup: copy the iris data, keep the labels in their own
  # vector (a factor), and drop the label column from the feature data.
  # None of this consumes random numbers, so doing it once is equivalent to
  # repeating it on every iteration.
  myiris <- iris
  myiris_labels <- myiris$Species
  myiris$Species <- NULL

  n_obs <- nrow(myiris)
  n_train <- floor(0.7 * n_obs)

  # begin the loop (seq_len() is safe even if num_iters is set to 0)
  for (i in seq_len(num_iters)) {

    # report out every tenth iteration
    if (i %% 10 == 0) {
      print(paste0("iteration: ", i))
    }

    # identify train/test split: 70% of rows for training, the rest for test
    indx_train <- sample(seq_len(n_obs), n_train)
    indx_test <- setdiff(seq_len(n_obs), indx_train)

    # split features (x) into train and test
    myiris_train <- myiris[indx_train, ]
    myiris_test <- myiris[indx_test, ]

    # split labels (y) into train and test, and store a randomly permuted
    # copy of the test labels as well
    y_train <- myiris_labels[indx_train]
    y_test <- myiris_labels[indx_test]
    y_test_shuffled <- sample(y_test)

    # Fit two forests. Both are trained on the real training labels; `ytest`
    # is only the test-set response handed to randomForest alongside
    # `xtest`, so the second forest differs from the first only through
    # random-number state, not through the labels it learned from.
    myrf <- randomForest(
      x = myiris_train, y = y_train,
      xtest = myiris_test, ytest = y_test,
      keep.forest = TRUE
    )
    myrf_shuffled <- randomForest(
      x = myiris_train, y = y_train,
      xtest = myiris_test, ytest = y_test_shuffled,
      keep.forest = TRUE
    )

    # predict the test rows with each forest, then free the models
    myrf_preds <- predict(myrf, myiris_test)
    myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test)
    rm(myrf, myrf_shuffled)

    # accuracy against the real test labels and against the shuffled ones;
    # an NA comparison is dropped from the numerator only, i.e. it counts
    # as a miss
    myrf_accuracy <- sum(myrf_preds == y_test, na.rm = TRUE) / length(y_test)
    myrf_accuracy_shuffled <-
      sum(myrf_preds_shuffled == y_test_shuffled, na.rm = TRUE) /
      length(y_test_shuffled)

    # store the accuracies in the pre-allocated vectors
    results[i] <- myrf_accuracy
    results_shuffled[i] <- myrf_accuracy_shuffled
  }


  # label which results came from which experiment and combine them into
  # one data frame
  df_results <- data.frame(accuracy = results, type = "y_test")
  df_results_shuffled <- data.frame(
    accuracy = results_shuffled,
    type = "y_shuffled"
  )
  df_all <- rbind(df_results, df_results_shuffled)


  # plot the density distribution of each group
  ggplot(df_all, aes(x = accuracy, fill = type)) +
    geom_density(alpha = 0.4) +
    theme_bw(base_size = 16) +
    ggtitle("Model vs Target-Shuffled Model")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement