celestialgod

Fastest Way to Add New Variables to A Large Data.Frame

Oct 31st, 2016
159
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 1.44 KB | None | 0 0
  # Packages attached by this benchmark script. doParallel and foreach are kept
  # from the original list even though the code visible here does not call them.
  pkgs <- list(
    "hflights", "doParallel", "foreach", "dplyr", "data.table",
    "microbenchmark"
  )
  # library() stops with an error as soon as a package is missing, whereas
  # require() only returns FALSE and lets the script fail later at first use.
  # invisible() suppresses printing of the list of search paths at top level.
  invisible(lapply(pkgs, library, character.only = TRUE))

  # Load the hflights data set, then build a data.table from it for the
  # data.table arm of the benchmark.
  data(hflights)
  hflights_DT <- data.table(hflights)
  6.  
  # Time four ways of adding the same two derived columns to the hflights data:
  #   wday  - "weekend" when DayOfWeek is 6 or 7, otherwise "weekday"
  #   delay - ArrDelay + DepDelay
  microbenchmark(
    transform = {
      # Base R generic for modifying a data.frame, similar to data.frame().
      transform(
        hflights,
        wday = ifelse(DayOfWeek %in% c(6, 7), "weekend", "weekday"),
        delay = ArrDelay + DepDelay
      )
    },
    within = {
      # Base R: evaluate the expression inside the data and keep the new
      # variables it assigns.
      within(hflights, {
        wday <- ifelse(DayOfWeek %in% c(6, 7), "weekend", "weekday")
        delay <- ArrDelay + DepDelay
      })
    },
    mutate = {
      # dplyr's dedicated verb for adding variables.
      mutate(
        hflights,
        wday = ifelse(DayOfWeek %in% c(6, 7), "weekend", "weekday"),
        delay = ArrDelay + DepDelay
      )
    },
    data.table = {
      # data.table `:=` assignment.
      # NOTE(review): `:=` updates hflights_DT by reference, so after the first
      # evaluation the two columns already exist and later evaluations
      # overwrite them; unlike the other three arms, no modified copy is
      # returned. Keep this in mind when comparing the timings.
      hflights_DT[, `:=`(
        wday = ifelse(DayOfWeek %in% c(6, 7), "weekend", "weekday"),
        delay = ArrDelay + DepDelay
      )]
    }
  )
  # Output recorded by the original author (100 evaluations per expression):
  # Unit: milliseconds
  #        expr       min        lq      mean    median        uq      max neval
  #   transform 168.17041 178.67501 185.84942 183.13138 187.97160 231.0397   100
  #      within  68.20806  72.91385  81.53380  75.11322  83.05596 124.4741   100
  #      mutate  68.63053  72.61046  78.44092  74.66822  78.68297 127.4133   100
  #  data.table  68.29788  72.54419  78.61315  75.00599  78.44248 119.8358   100
Advertisement
Add Comment
Please, Sign In to add comment