celestialgod

faster merge with function aggregation

Oct 13th, 2015
310
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 1.42 KB | None | 0 0
  1. library(data.table)
  2. library(dplyr)
  3. library(purrr)
  4. library(magrittr)
  5.  
  6. aggregate_merge <- function(..., by){
  7.   dfs = list(...)
  8.   overall_keys = dfs %>% map(~select_(., .dots = by)) %>% rbindlist %>% distinct
  9.   dfs %<>% map(~merge(., overall_keys, by = by, all = TRUE))
  10.   duplicated_cols = dfs %>% map(~names(.)) %>% do.call(c, .) %>%
  11.     .[duplicated(.)] %>% setdiff(by)
  12.   tmp = llply(dfs, function(x) llply(1:length(duplicated_cols), function(i){
  13.     x[[duplicated_cols[i]]]
  14.   })) %>% zip_n %>% map(~do.call(cbind, .))
  15.   tmp2 = tmp %>% map(~which(rowMeans(is.na(.)) == 1))
  16.   tmp %<>% map(~rowSums(., na.rm = TRUE)) %>%
  17.     set_names(duplicated_cols) %>% as_data_frame %>% tbl_dt(FALSE) %>%
  18.     bind_cols(overall_keys) %>% tbl_dt(FALSE)
  19.   for (i in 1:length(tmp2))
  20.   {
  21.     if (length(tmp2[[i]]) > 0)
  22.       set(tmp, i = tmp2[[i]], j = i, value = NA)
  23.   }
  24.  
  25.   bind_dfs_length = llply(dfs, function(x) setdiff(names(x), c(by, duplicated_cols))) %>% map_int(~length(.))
  26.   dfs[bind_dfs_length > 0] %>% map(~select_(., .dots = setdiff(names(.), c(by, duplicated_cols)))) %>%
  27.     bind_cols %>% tbl_dt(FALSE) %>% bind_cols(overall_keys) %>% tbl_dt(FALSE) %>%
  28.     merge(tmp, by = by)
  29. }
  30.  
  31. t = proc.time()
  32. out6 = aggregate_merge(x, y, z, by = "SP")
  33. proc.time() - t
  34. #   user  system elapsed
  35. #   0.24    0.08    0.31
  36.  
  37. out6 %<>% select_(.dots = names(out4)) %>% arrange(SP) %>% mutate_each(funs(as.integer))
  38. all.equal(out4, out6) # TRUE
Advertisement
Add Comment
Please, Sign In to add comment