Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(data.table)
- library(dplyr)
- library(purrr)
- library(magrittr)
- aggregate_merge <- function(..., by){
- dfs = list(...)
- overall_keys = dfs %>% map(~select_(., .dots = by)) %>% rbindlist %>% distinct
- dfs %<>% map(~merge(., overall_keys, by = by, all = TRUE))
- duplicated_cols = dfs %>% map(~names(.)) %>% do.call(c, .) %>%
- .[duplicated(.)] %>% setdiff(by)
- tmp = llply(dfs, function(x) llply(1:length(duplicated_cols), function(i){
- x[[duplicated_cols[i]]]
- })) %>% zip_n %>% map(~do.call(cbind, .))
- tmp2 = tmp %>% map(~which(rowMeans(is.na(.)) == 1))
- tmp %<>% map(~rowSums(., na.rm = TRUE)) %>%
- set_names(duplicated_cols) %>% as_data_frame %>% tbl_dt(FALSE) %>%
- bind_cols(overall_keys) %>% tbl_dt(FALSE)
- for (i in 1:length(tmp2))
- {
- if (length(tmp2[[i]]) > 0)
- set(tmp, i = tmp2[[i]], j = i, value = NA)
- }
- bind_dfs_length = llply(dfs, function(x) setdiff(names(x), c(by, duplicated_cols))) %>% map_int(~length(.))
- dfs[bind_dfs_length > 0] %>% map(~select_(., .dots = setdiff(names(.), c(by, duplicated_cols)))) %>%
- bind_cols %>% tbl_dt(FALSE) %>% bind_cols(overall_keys) %>% tbl_dt(FALSE) %>%
- merge(tmp, by = by)
- }
- t = proc.time()
- out6 = aggregate_merge(x, y, z, by = "SP")
- proc.time() - t
- # user system elapsed
- # 0.24 0.08 0.31
- out6 %<>% select_(.dots = names(out4)) %>% arrange(SP) %>% mutate_each(funs(as.integer))
- all.equal(out4, out6) # TRUE
Advertisement
Add Comment
Please, Sign In to add comment