celestialgod

column merge and split with selecting columns

Jul 16th, 2016
160
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 3.88 KB | None | 0 0
  1. library(pipeR)
  2. library(plyr)
  3. library(data.table)
  4. library(dtplyr)
  5. library(dplyr)
  6. library(tidyr)
  7.  
  8. # data generation
  9. num_csv <- 3e3
  10. num_xlvls <- 30
  11. num_ylvls <- 30
  12. timePoints <- expand.grid(paste0("a", 1:num_xlvls),
  13.                           paste0("b", 1:num_ylvls),
  14.                           stringsAsFactors = FALSE)
  15. csv_files <- lapply(1:num_csv, function(i){
  16.   dat <- sample(1:nrow(timePoints), sample(400:600, 1)) %>>%
  17.     rep(each = 5) %>>% (timePoints[., ]) %>>%
  18.     modifyList(setNames(lapply(1:3, function(j){
  19.       sample(1:100, nrow(.), TRUE)
  20.     }), paste0("Var", 3:5))) %>>% tbl_df
  21. })
  22.  
  23. selectedCol <- c(3, 5)
  24.  
  25. st <- proc.time()
  26. outRes2 <- lapply(csv_files, function(subdf, pasteCols){
  27.   aggFml <- as.formula(paste0("~paste(", paste0(pasteCols, collapse = ","), ",sep=';')"))
  28.   tmpdf <- subdf %>>% mutate_(.dots = setNames(
  29.     list(lazyeval::interp(aggFml, .values = environment())), "tmp")) %>>%
  30.     group_by(Var1, Var2) %>>%
  31.     summarise(values = paste(tmp, collapse=';')) %>>% ungroup
  32.   numCols <- length(strsplit(tmpdf$values[1], ";")[[1]])
  33.   tmpdf %>>% separate(values, paste0("V", 1:numCols)) %>>%
  34.     mutate_each(funs(as.integer(.)), -Var1, -Var2)
  35. }, pasteCols = paste0("Var", selectedCol)) %>>% bind_rows
  36. proc.time() - st
  37. #  user  system elapsed
  38. # 66.91    0.09   67.75
  39.  
  40.  
  41. st <- proc.time()
  42. outRes3 <- lapply(csv_files, function(subdf, pasteCols){
  43.   evalStr <- paste0("paste(", paste0(pasteCols, collapse = ","), ",sep=';')")
  44.   tmpDT <- subdf %>>% data.table %>>% `[`( , tmp1 := eval(parse(text = evalStr))) %>>%
  45.     `[`( , list(tmp = paste(tmp1, collapse = ";")), by = c("Var1", "Var2"))
  46.   numCols <- length(strsplit(tmpDT$tmp[1], ";")[[1]])
  47.   tmpDT %>>% `[`( , `:=`(paste0("V", 1:numCols), tstrsplit(tmp, ";"))) %>>%
  48.     `[`( , tmp := NULL) %>>%
  49.     `[`( , `:=`(paste0("V", 1:numCols), lapply(.SD, as.integer)),
  50.          .SDcols = paste0("V", 1:numCols))
  51. }, pasteCols = paste0("Var", selectedCol)) %>>% rbindlist
  52. proc.time() - st
  53. #  user  system elapsed
  54. # 36.70    0.89   37.32
  55.  
  56. st <- proc.time()
  57. outRes4 <- lapply(csv_files, function(subdf, pasteCols){
  58.   aggFml <- as.formula(paste0("~paste(", paste0(pasteCols, collapse = ","), ",sep=';')"))
  59.   tmpdf <- subdf %>>% tbl_dt %>>% mutate_(.dots = setNames(
  60.     list(lazyeval::interp(aggFml, .values = environment())), "tmp")) %>>%
  61.     group_by(Var1, Var2) %>>% summarise(values = paste(tmp, collapse=';')) %>>% ungroup
  62.   numCols <- length(strsplit(tmpdf$values[1], ";")[[1]])
  63.   tmpdf %>>% separate(values, paste0("V", 1:numCols)) %>>%
  64.     mutate_each(funs(as.integer(.)), -Var1, -Var2)
  65. }, pasteCols = paste0("Var", selectedCol)) %>>% bind_rows
  66. proc.time() - st
  67. #  user  system elapsed
  68. # 55.94    1.34   57.13
  69.  
  70. # print out the versions of every used pkgs
  71. # sessionInfo()
  72. # R version 3.2.5 (2016-04-14)
  73. # Platform: x86_64-w64-mingw32/x64 (64-bit)
  74. # Running under: Windows 7 x64 (build 7601) Service Pack 1
  75. #
  76. # locale:
  77. # [1] LC_COLLATE=Chinese (Traditional)_Taiwan.950  LC_CTYPE=Chinese (Traditional)_Taiwan.950  
  78. # [3] LC_MONETARY=Chinese (Traditional)_Taiwan.950 LC_NUMERIC=C                                
  79. # [5] LC_TIME=Chinese (Traditional)_Taiwan.950    
  80. #
  81. # attached base packages:
  82. # [1] stats     graphics  grDevices utils     datasets  methods   base    
  83. #
  84. # other attached packages:
  85. # 1] tidyr_0.5.1.9000    data.table_1.9.7    dplyr_0.5.0.9000    dtplyr_0.0.1.9000  
  86. # [5] plyr_1.8.4.9000     pipeR_0.6.1.3       RevoUtilsMath_3.2.5
  87. #
  88. # loaded via a namespace (and not attached):
  89. # [1] Rcpp_0.12.5.2        assertthat_0.1.0.99  digest_0.6.9         withr_1.0.1        
  90. # [5] chron_2.3-47         R6_2.1.2             DBI_0.4-1            magrittr_1.5        
  91. # [9] git2r_0.14.0         httr_1.1.0           stringi_1.0-1        curl_0.9.7          
  92. # [13] lazyeval_0.1.10.9000 devtools_1.11.1      tools_3.2.5          memoise_1.0.0      
  93. # [17] knitr_1.12.3         tibble_1.0
Advertisement
Add Comment
Please, Sign In to add comment