celestialgod

grouping

May 21st, 2015
466
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 1.81 KB | None | 0 0
  1. # vectorise
  #' Bin column 8 of `data` into four groups (vectorised, via `cut()`).
  #'
  #' Intervals are right-closed, as is the default for `cut()`:
  #' (-Inf, 0.25] -> "group1", (0.25, 0.5] -> "group2",
  #' (0.5, 0.75] -> "group3", (0.75, Inf] -> "group4".
  #'
  #' @param data A data frame (or subclass) or a matrix with at least eight
  #'   columns; column 8 must be numeric.
  #' @return The result of `cbind(data, new_col)`, where `new_col` is a factor
  #'   with levels "group1" to "group4".
  group_f1 <- function(data) {
    # Read from the argument rather than a global `X`, so the grouping always
    # describes the object that was passed in. `[[` yields a plain vector for
    # every data-frame flavour; matrices need `[, 8]`.
    values <- if (is.data.frame(data)) data[[8]] else data[, 8]
    new_col <- cut(
      values,
      breaks = c(-Inf, 0.25, 0.5, 0.75, Inf),
      labels = paste0("group", 1:4)
    )
    cbind(data, new_col)
  }
  7.  
  8. # for loop
  #' Bin column 8 of `data` into four groups with an explicit `for` loop.
  #'
  #' Intervals are left-closed: value < 0.25 -> "group1", < 0.5 -> "group2",
  #' < 0.75 -> "group3", otherwise "group4".
  #'
  #' @param data A data frame or matrix with at least eight columns; column 8
  #'   must be numeric and free of `NA` (an `NA` makes the `if` fail).
  #' @return The result of `cbind(data, new_col)`, where `new_col` holds one
  #'   group label per row of `data`.
  group_f2 <- function(data) {
    # Typed preallocation keeps the column character even for zero rows.
    new_col <- rep(NA_character_, nrow(data))
    # seq_along() yields an empty sequence for zero-row input, whereas
    # 1:length(new_col) would iterate over c(1, 0) and fail on a missing row.
    for (i in seq_along(new_col)) {
      # Row-by-row lookup is kept deliberately: this function is the
      # "for loop" variant of the grouping.
      value <- data[i, 8]
      if (value < 0.25) {
        new_col[i] <- "group1"
      } else if (value < 0.5) {
        new_col[i] <- "group2"
      } else if (value < 0.75) {
        new_col[i] <- "group3"
      } else {
        new_col[i] <- "group4"
      }
    }
    cbind(data, new_col)
  }
  24.  
  25. # sapply
  #' Bin column 8 of `data` into four groups by applying a function per row.
  #'
  #' Intervals are left-closed: value < 0.25 -> "group1", < 0.5 -> "group2",
  #' < 0.75 -> "group3", otherwise "group4".
  #'
  #' @param data A data frame or matrix with at least eight columns; column 8
  #'   must be numeric and free of `NA` (an `NA` makes the `if` fail).
  #' @return The result of `cbind(data, new_col)`, where `new_col` holds one
  #'   group label per row of `data`.
  group_f3 <- function(data) {
    # vapply() guarantees a character vector (character(0) for zero rows);
    # seq_len() avoids the c(1, 0) sequence that 1:nrow(data) gives when
    # `data` has no rows.
    new_col <- vapply(
      seq_len(nrow(data)),
      function(i) {
        value <- data[i, 8]
        if (value < 0.25) {
          "group1"
        } else if (value < 0.5) {
          "group2"
        } else if (value < 0.75) {
          "group3"
        } else {
          "group4"
        }
      },
      character(1)
    )
    cbind(data, new_col)
  }
  40.  
  41. # plyr and dplyr
  42. library(data.table)
  43. library(plyr)
  44. library(dplyr)
  45. library(magrittr)
  #' Bin column `X8` of `data` into four groups using dplyr and plyr verbs.
  #'
  #' `mutate()` adds `new_col` by cutting `X8` at 0.25, 0.5 and 0.75
  #' (right-closed intervals, the `cut()` default); `transform()` then renames
  #' the four factor levels to "group1" .. "group4" with `mapvalues()`.
  #'
  #' @param data A table with a numeric column named `X8`.
  #' @return `data` with the additional factor column `new_col`.
  group_f4 <- function(data) {
    binned <- mutate(data, new_col = cut(X8, c(-Inf, 0.25, 0.5, 0.75, Inf)))
    transform(
      binned,
      new_col = mapvalues(
        new_col,
        from = levels(new_col),
        to = paste0("group", 1:4)
      )
    )
  }
  51.  
  52. library(rbenchmark)
  ## Data generation: a 200000 x 10 matrix whose j-th column is drawn from a
  ## normal distribution with mean (j - 5.5) * 4 / 20, i.e. means -0.9 .. 0.9.
  X <- sapply((1:10 - 5.5) * 4 / 20, rnorm, n = 200000)
  ## The same data wrapped as a dplyr data-frame table and a data.table table.
  X_df <- tbl_df(data.frame(X))
  X_dt <- tbl_dt(data.table(X_df), FALSE)
  ## Time each grouping variant 20 times, fastest first.
  benchmark(
    group_f1(X_df),
    group_f2(X_df),
    group_f3(X_df),
    group_f4(X_df),
    group_f4(X_dt),
    columns = c("test", "replications", "elapsed", "relative", "user.self"),
    order = "relative",
    replications = 20
  )
  63. #             test replications elapsed relative user.self
  64. # 1 group_f1(X_df)           20    0.96    1.000      0.95
  65. # 4 group_f4(X_df)           20    0.98    1.021      0.97
  66. # 5 group_f4(X_dt)           20    1.23    1.281      1.20
  67. # 2 group_f2(X_df)           20  417.14  434.521    411.56
  68. # 3 group_f3(X_df)           20  418.21  435.635    412.50
Advertisement
Add Comment
Please, Sign In to add comment