Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(data.table)
- library(plyr)
- library(dplyr)
- library(tidyr)
- library(purrr)
- library(magrittr)
- library(Rcpp)
- library(RcppArmadillo)
- run_dplyr_method = TRUE
- run_plyr_method = TRUE
- w <- matrix(sample(2e6), 1e5) %>% data.table() %>%
- setnames(1:20,sample(LETTERS,20)) %>% .[,SP:=seq_len(nrow(.))]
- x <- matrix(sample(1e6), 1e5) %>% data.table() %>%
- setnames(1:10,sample(LETTERS,10)) %>% .[,SP:=seq_len(nrow(.))]
- y <- matrix(sample(1e5), 1e4) %>% data.table() %>%
- setnames(1:10,sample(LETTERS,10)) %>% .[,SP:=seq_len(nrow(.))]
- z <- matrix(sample(4e5), 2e4) %>% data.table() %>%
- setnames(1:20,sample(LETTERS,20)) %>% .[,SP:=seq_len(nrow(.))]
- cppFunction('
- NumericVector addv(NumericVector x, NumericVector y) {
- NumericVector out(x.size());
- NumericVector::iterator x_it,y_it,out_it;
- for (x_it = x.begin(), y_it=y.begin(), out_it = out.begin();
- x_it != x.end(); ++x_it, ++y_it, ++out_it) {
- if (ISNA(*x_it)) {
- *out_it = *y_it;
- } else if (ISNA(*y_it)) {
- *out_it = *x_it;
- } else {
- *out_it = *x_it + *y_it;
- }
- }
- return out;
- }')
- outer_join2 <- function(df1,df2,byNames) {
- tt = intersect(colnames(df1)[-match(byNames,colnames(df1))],
- colnames(df2)[-match(byNames,colnames(df2))])
- if (length(tt) > 0)
- {
- df <- merge(df2, df1 %>% select_(.dots = paste0('-', tt)), by=byNames, all=TRUE)
- dt <- merge(df2 %>% select_(.dots = paste0('-', tt)),
- df1 %>% select_(.dots = c(byNames, tt)),by=byNames,all=T) %>%
- select_(.dots = tt)
- for (j in colnames(dt)) {set(df,j=j,value=addv(df[[j]],dt[[j]]))}
- } else
- {
- df = merge(df2, df1, by=byNames, all=TRUE)
- }
- return (df)
- }
- t = proc.time()
- out1 = Reduce(function(df1, df2) outer_join2(df1, df2, byNames="SP"), list(w,x,y,z)) %>% tbl_dt(FALSE)
- proc.time() - t
- # user system elapsed
- # 0.34 0.07 0.40
- sourceCpp(code = '
- #include <Rcpp.h>
- #include <string>
- #include <vector>
- using namespace Rcpp;
- using namespace std;
- // [[Rcpp::export]]
- List aggregate_merge_cpp(DataFrame df1, DataFrame df2,
- vector<string> names_merge, Function aggregate_f){
- List outputList(names_merge.size());
- for (size_t i = 0; i < names_merge.size(); i++)
- {
- switch ( TYPEOF(df1[names_merge[i]]) ) {
- case REALSXP: {
- NumericVector tmp1 = as<NumericVector>(df1[names_merge[i]]),
- tmp2 = as<NumericVector>(df2[names_merge[i]]);
- outputList[i] = aggregate_f(tmp1, tmp2);
- break;
- }
- case INTSXP: {
- IntegerVector tmp1 = as<IntegerVector>(df1[names_merge[i]]),
- tmp2 = as<IntegerVector>(df2[names_merge[i]]);
- outputList[i] = aggregate_f(tmp1, tmp2);
- break;
- }
- default:
- stop("unsupported data type");
- }
- }
- outputList.attr("names") = names_merge;
- return outputList;
- }');
- aggregate_merge <- function(x, y, byNames, aggregate_f = magrittr:::add){
- if (!"data.table" %in% class(x))
- x %<>% tbl_dt(FALSE)
- if (!"data.table" %in% class(y))
- y %<>% tbl_dt(FALSE)
- aggregate_names = setdiff(intersect(names(x), names(y)), byNames)
- if ( length(aggregate_names) > 0)
- {
- x_index = x %>% select_(.dots = byNames)
- y_index = y %>% select_(.dots = byNames)
- indecies = bind_rows(x_index, y_index)
- combine_rows = which(duplicated(indecies))
- indecies %<>% filter(1:nrow(.) %in% combine_rows)
- if (length(byNames) >= 2)
- {
- stop("This part is undone for merging byNames whose length is greater than 2.")
- } else
- {
- x_index = match(indecies[[byNames]], x_index[[byNames]])
- y_index = match(indecies[[byNames]], y_index[[byNames]])
- }
- aggregate_dt = aggregate_merge_cpp(
- x %>% select_(.dots = aggregate_names) %>% filter(1:nrow(.) %in% x_index) %>% replace(is.na(.), as.integer(0)),
- y %>% select_(.dots = aggregate_names) %>% filter(1:nrow(.) %in% y_index) %>% replace(is.na(.), as.integer(0)),
- aggregate_names, aggregate_f) %>% as_data_frame %>%
- bind_cols(x %>% select_(.dots = byNames) %>% filter(1:nrow(.) %in% x_index)) %>%
- bind_rows(
- x %>% select_(.dots = c(byNames, aggregate_names)) %>% filter(!(1:nrow(.) %in% x_index)),
- y %>% select_(.dots = c(byNames, aggregate_names)) %>% filter(!(1:nrow(.) %in% y_index))) %>%
- tbl_dt(FALSE) %>% arrange_(.dots = byNames) %>% select_(.dots = paste0("-", byNames))
- out_dt = merge(x %>% select_(.dots = setdiff(names(.), aggregate_names)),
- y %>% select_(.dots = setdiff(names(.), aggregate_names)),
- by = byNames, all = TRUE) %>% arrange_(.dots = byNames) %>%
- bind_cols(aggregate_dt) %>% tbl_dt(FALSE)
- }
- else
- out_dt = merge(x, y, by = byNames, all = TRUE)
- out_dt
- }
- t = proc.time()
- out2 = Reduce(function(df1, df2) aggregate_merge(df1, df2, byNames="SP"), list(w,x,y,z))
- proc.time() - t
- # user system elapsed
- # 0.56 0.03 0.59
- if (run_dplyr_method)
- {
- t = proc.time()
- wide_table = rbind.fill(list(w, x, y, z)) %>% tbl_dt(FALSE)
- sum_without_na = function(vec) ifelse(all(is.na(vec)), NA_integer_, sum(vec, na.rm = TRUE))
- out3 = wide_table %>% group_by(SP) %>% summarise_each(funs(sum_without_na))
- proc.time() - t
- }
- # user system elapsed
- # 5.77 0.03 5.80
- t = proc.time()
- out4 = list(w, x, y, z) %>% llply(function(dt){
- gather(dt, variable, values, -SP) %>% mutate(variable = as.character(variable))
- }) %>% bind_rows %>% tbl_dt(FALSE) %>% group_by(SP, variable) %>%
- summarise(values = sum(values)) %>% spread(variable, values)
- proc.time() - t
- # user system elapsed
- # 1.36 0.08 1.44
- if (run_plyr_method)
- {
- t = proc.time()
- wide_table = rbind.fill(list(w, x, y, z)) %>% tbl_dt(FALSE)
- out5 = ddply(wide_table, .(SP), function(mat) colSums(mat, na.rm = TRUE)) %>% tbl_dt(FALSE)
- proc.time() - t
- # user system elapsed
- # 28.97 0.12 29.09
- }
- aggregate_merge <- function(..., by){
- dfs = list(...)
- overall_keys = dfs %>% map(~select_(., .dots = by)) %>% rbindlist %>% distinct
- dfs %<>% map(~merge(., overall_keys, by = by, all = TRUE))
- duplicated_cols = dfs %>% map(~names(.)) %>% do.call(c, .) %>%
- .[duplicated(.)] %>% setdiff(by)
- tmp = llply(dfs, function(dt) llply(1:length(duplicated_cols), function(i){
- dt[[duplicated_cols[i]]]
- })) %>% zip_n %>% map(~do.call(cbind, .))
- tmp2 = tmp %>% map(~which(rowMeans(is.na(.)) == 1))
- tmp %<>% map(~rowSums(., na.rm = TRUE)) %>%
- set_names(duplicated_cols) %>% as_data_frame %>% tbl_dt(FALSE) %>%
- bind_cols(overall_keys) %>% tbl_dt(FALSE)
- for (i in 1:length(tmp2))
- {
- if (length(tmp2[[i]]) > 0)
- set(tmp, i = tmp2[[i]], j = i, value = NA)
- }
- bind_dfs_length = llply(dfs, function(dt) setdiff(names(dt), c(by, duplicated_cols))) %>% map_int(~length(.))
- dfs[bind_dfs_length > 0] %>% map(~select_(., .dots = setdiff(names(.), c(by, duplicated_cols)))) %>%
- bind_cols %>% tbl_dt(FALSE) %>% bind_cols(overall_keys) %>% tbl_dt(FALSE) %>%
- merge(tmp, by = by)
- }
- t = proc.time()
- out6 = aggregate_merge(w, x, y, z, by = "SP")
- proc.time() - t
- # user system elapsed
- # 0.36 0.02 0.38
- sourceCpp(code = '
- #include <Rcpp.h>
- #include <string>
- #include <vector>
- using namespace Rcpp;
- using namespace std;
- NumericVector add_narm(NumericVector x, NumericVector y) {
- NumericVector out(x.size());
- NumericVector::iterator x_it,y_it,out_it;
- for (x_it = x.begin(), y_it=y.begin(), out_it = out.begin();
- x_it != x.end(); ++x_it, ++y_it, ++out_it) {
- if (ISNA(*x_it)) {
- *out_it = *y_it;
- } else if (ISNA(*y_it)) {
- *out_it = *x_it;
- } else {
- *out_it = *x_it + *y_it;
- }
- }
- return out;
- }
- // [[Rcpp::export]]
- DataFrame merge_all_cpp(List dfs){
- DataFrame out(dfs[0]);
- vector<string> df_names = out.names();
- for (int i = 1; i < dfs.length(); i++)
- {
- DataFrame tmp(dfs[i]);
- vector<string> tmp_names = tmp.names();
- for (int j = 0; j < tmp.length(); j++)
- {
- if (find(df_names.begin(), df_names.end(), tmp_names[j]) != df_names.end())
- {
- out[tmp_names[j]] = add_narm(
- as<NumericVector>(out[tmp_names[j]]),
- as<NumericVector>(tmp[tmp_names[j]]));
- }
- else
- {
- out.push_back(tmp[tmp_names[j]], tmp_names[j]);
- df_names.push_back(tmp_names[j]);
- }
- }
- }
- return out;
- }');
- t = proc.time()
- dfs = list(w,x,y,z)
- by = "SP"
- overall_keys = list(w,x,y,z) %>% map(~select_(., .dots = by)) %>% rbindlist %>% distinct
- dfs %<>% map(~merge(., overall_keys, by = by, all = TRUE))
- ## not use purrr
- # overall_keys = dfs %>% lapply(function(dt) select_(x, .dots = by)) %>% rbindlist %>% distinct
- # dfs %<>% lapply(function(dt) merge(x, overall_keys, by = by, all = TRUE))
- f = function(var) var / length(dfs)
- out7 = merge_all_cpp(dfs) %>% tbl_dt(FALSE) %>% mutate_each_(funs(f), by)
- proc.time() - t
- # user system elapsed
- # 0.35 0.00 0.35
- sourceCpp(code = '
- // [[Rcpp::depends(RcppArmadillo)]]
- #include <RcppArmadillo.h>
- #include <string>
- #include <vector>
- #include <algorithm>
- using namespace Rcpp;
- using namespace std;
- using namespace arma;
- // [[Rcpp::export]]
- NumericVector add_narm(SEXP xs, SEXP ys){
- NumericVector xr(xs);
- NumericVector yr(ys);
- colvec x(xr.begin(), xr.size(), false);
- colvec y(yr.begin(), yr.size(), false);
- uvec loc_na_x = find_nonfinite(x);
- uvec loc_na_y = find_nonfinite(y);
- colvec z = x + y;
- vector<uword> loc_na_std;
- set_intersection(loc_na_x.begin(), loc_na_x.end(),
- loc_na_y.begin(), loc_na_y.end(),
- back_inserter(loc_na_std));
- uvec loc_na = conv_to<uvec>::from(loc_na_std);
- x.elem(loc_na_x).zeros();
- y.elem(loc_na_y).zeros();
- z = x + y;
- z.elem(loc_na).fill(NA_REAL);
- return wrap(z);
- }
- // [[Rcpp::export]]
- DataFrame merge_all_cpp_v2(List dfs){
- DataFrame out(dfs[0]);
- vector<string> df_names = out.names();
- for (int i = 1; i < dfs.length(); i++)
- {
- DataFrame tmp(dfs[i]);
- vector<string> tmp_names = tmp.names();
- for (int j = 0; j < tmp.length(); j++)
- {
- if (find(df_names.begin(), df_names.end(), tmp_names[j]) != df_names.end())
- out[tmp_names[j]] = add_narm(out[tmp_names[j]], tmp[tmp_names[j]]);
- else
- {
- out.push_back(tmp[tmp_names[j]], tmp_names[j]);
- df_names.push_back(tmp_names[j]);
- }
- }
- }
- return out;
- }')
- t = proc.time()
- dfs = list(w,x,y,z)
- by = "SP"
- overall_keys = dfs %>% map(~select_(., .dots = by)) %>% rbindlist %>% distinct
- dfs %<>% map(~merge(., overall_keys, by = by, all = TRUE))
- ## not use purrr
- # overall_keys = dfs %>% lapply(function(dt) select_(dt, .dots = by)) %>% rbindlist %>% distinct
- # dfs %<>% lapply(function(dt) merge(dt, overall_keys, by = by, all = TRUE))
- f = function(var) var / length(dfs)
- out8 = merge_all_cpp_v2(dfs) %>% tbl_dt(FALSE) %>% mutate_each_(funs(f), by)
- proc.time() - t
- # user system elapsed
- # 0.11 0.00 0.11
- out1 %<>% select_(.dots = names(out4)) %>% arrange(SP) %>% mutate_each(funs(as.integer))
- out2 %<>% select_(.dots = names(out4)) %>% arrange(SP)
- if (run_dplyr_method)
- out3 %<>% select_(.dots = names(out4)) %>% arrange(SP)
- out4 %<>% arrange(SP)
- if (run_plyr_method)
- out5 %<>% select_(.dots = names(out4)) %>% arrange(SP) %>% mutate_each_(funs(f), by) %>% mutate_each(funs(as.integer))
- out6 %<>% select_(.dots = names(out4)) %>% arrange(SP) %>% mutate_each(funs(as.integer))
- out7 %<>% select_(.dots = names(out4)) %>%
- arrange_(.dots = by) %>% mutate_each(funs(as.integer))
- out8 %<>% select_(.dots = names(out4)) %>%
- arrange_(.dots = by) %>% mutate_each(funs(as.integer))
- all.equal(out4, out1) # TRUE
- all.equal(out4, out2) # TRUE
- if (run_dplyr_method)
- all.equal(out4, out3) # TRUE
- if (run_plyr_method)
- all.equal(out4, out5) # it has a error for NA + NA = 0
- all.equal(out4, out6) # TRUE
- all.equal(out4, out7) # TRUE
- all.equal(out4, out8) # TRUE
Advertisement
Add Comment
Please, Sign In to add comment