merge with function aggregation

library(data.table)
library(plyr)
library(dplyr)
library(tidyr)
library(purrr)
library(magrittr)
library(Rcpp)

run_plyr_method = FALSE

x <- matrix(sample(1e6), 1e5) %>% data.table() %>%
     setnames(1:10,sample(LETTERS,10)) %>% .[,SP:=seq_len(nrow(.))]
y <- matrix(sample(1e5), 1e4) %>% data.table() %>%
     setnames(1:10,sample(LETTERS,10)) %>% .[,SP:=seq_len(nrow(.))]
z <- matrix(sample(4e5), 2e4) %>% data.table() %>%
     setnames(1:20,sample(LETTERS,20)) %>% .[,SP:=seq_len(nrow(.))]

cppFunction('
NumericVector addv(NumericVector x, NumericVector y) {
  NumericVector out(x.size());
  NumericVector::iterator x_it,y_it,out_it;
  for (x_it = x.begin(), y_it=y.begin(), out_it = out.begin();
       x_it != x.end(); ++x_it, ++y_it, ++out_it) {
    if (ISNA(*x_it)) {
      *out_it = *y_it;
    } else if (ISNA(*y_it)) {
      *out_it = *x_it;
    } else {
      *out_it = *x_it + *y_it;
    }
  }
  return out;
}')

outer_join2 <- function (df1,df2,byNames) {
  tt = intersect(colnames(df1)[-match(byNames,colnames(df1))],
               colnames(df2)[-match(byNames,colnames(df2))])
  if (length(tt) > 0)
  {
    df <- merge(df2, df1 %>% select_(.dots = paste0('-', tt)), by=byNames, all=TRUE)
    dt <- merge(df2 %>% select_(.dots = paste0('-', tt)),
        df1 %>% select_(.dots = c(byNames, tt)),by=byNames,all=T) %>%
          select_(.dots = tt)
    for (j in colnames(dt)) {set(df,j=j,value=addv(df[[j]],dt[[j]]))}
  } else
  {
    df = merge(df2, df1, by=byNames, all=TRUE)
  }
  return (df)
}

t = proc.time()
out1 = Reduce(function(x, y) outer_join2(x, y, byNames="SP"), list(x,y,z)) %>% tbl_dt(FALSE)
proc.time() - t
#   user  system elapsed
#   0.36    0.03    0.40

sourceCpp(code = '
#include <Rcpp.h>
#include <string>
#include <vector>
using namespace Rcpp;
using namespace std;

// [[Rcpp::export]]
List aggregate_merge_cpp(DataFrame df1, DataFrame df2,
  vector<string> names_merge, Function aggregate_f){
  List outputList(names_merge.size());
  for (size_t i = 0; i < names_merge.size(); i++)
  {
    switch ( TYPEOF(df1[names_merge[i]]) ) {
      case REALSXP: {
        NumericVector tmp1 = as<NumericVector>(df1[names_merge[i]]),
          tmp2 = as<NumericVector>(df2[names_merge[i]]);
        outputList[i] = aggregate_f(tmp1, tmp2);
        break;
      }
      case INTSXP: {
        IntegerVector tmp1 = as<IntegerVector>(df1[names_merge[i]]),
          tmp2 = as<IntegerVector>(df2[names_merge[i]]);
        outputList[i] = aggregate_f(tmp1, tmp2);
        break;
      }
      default:
        stop("unsupported data type");
    }
  }
  outputList.attr("names") = names_merge;
  return outputList;
}');

aggregate_merge <- function(x, y, byNames, aggregate_f = magrittr:::add){
  if (!"data.table" %in% class(x))
    x %<>% tbl_dt(FALSE)
  if (!"data.table" %in% class(y))
    y %<>% tbl_dt(FALSE)

  aggregate_names = setdiff(intersect(names(x), names(y)), byNames)
  if ( length(aggregate_names) > 0)
  {
    x_index = x %>% select_(.dots = byNames)
    y_index = y %>% select_(.dots = byNames)
    indecies = bind_rows(x_index, y_index)
    combine_rows = which(duplicated(indecies))
    indecies %<>% filter(1:nrow(.) %in% combine_rows)

    if (length(byNames) >= 2)
    {
      stop("This part is undone for merging byNames whose length is greater than 2.")
    } else
    {
      x_index = match(indecies[[byNames]], x_index[[byNames]])
      y_index = match(indecies[[byNames]], y_index[[byNames]])
    }
    aggregate_dt = aggregate_merge_cpp(
      x %>% select_(.dots = aggregate_names) %>% filter(1:nrow(.) %in% x_index) %>% replace(is.na(.), as.integer(0)),
      y %>% select_(.dots = aggregate_names) %>% filter(1:nrow(.) %in% y_index) %>% replace(is.na(.), as.integer(0)),
      aggregate_names, aggregate_f) %>% as_data_frame %>%
      bind_cols(x %>% select_(.dots = byNames) %>% filter(1:nrow(.) %in% x_index)) %>%
      bind_rows(
        x %>% select_(.dots = c(byNames, aggregate_names)) %>% filter(!(1:nrow(.) %in% x_index)),
        y %>% select_(.dots = c(byNames, aggregate_names)) %>% filter(!(1:nrow(.) %in% y_index))) %>%
      tbl_dt(FALSE) %>% arrange_(.dots = byNames) %>% select_(.dots = paste0("-", byNames))
    out_dt = merge(x %>% select_(.dots = setdiff(names(.), aggregate_names)),
          y %>% select_(.dots = setdiff(names(.), aggregate_names)),
          by = byNames, all = TRUE) %>% arrange_(.dots = byNames) %>%
       bind_cols(aggregate_dt) %>% tbl_dt(FALSE)
  }
  else
    out_dt = merge(x, y, by = byNames, all = TRUE)
  out_dt
}
t = proc.time()
out2 = Reduce(function(df1, df2) aggregate_merge(df1, df2, byNames="SP"), list(x,y,z))
proc.time() - t
#   user  system elapsed
#   0.31    0.00    0.32

t = proc.time()
wide_table = rbind.fill(list(x, y, z)) %>% tbl_dt(FALSE)
sum_without_na = function(x) ifelse(all(is.na(x)), NA_integer_, sum(x, na.rm = TRUE))
out3 = wide_table %>% group_by(SP) %>% summarise_each(funs(sum_without_na))
proc.time() - t
#   user  system elapsed
#   8.61    0.00    8.66

t = proc.time()
out4 = list(x, y, z) %>% llply(function(x){
  gather(x, variable, values, -SP) %>% mutate(variable = as.character(variable))
  }) %>% bind_rows %>% tbl_dt(FALSE) %>% group_by(SP, variable) %>%
  summarise(values = sum(values)) %>% spread(variable, values)
proc.time() - t
#   user  system elapsed
#   1.06    0.12    1.19

if (run_plyr_method)
{
  t = proc.time()
  wide_table = rbind.fill(list(x, y, z)) %>% tbl_dt(FALSE)
  out5 = ddply(wide_table, .(SP), function(x) colSums(x, na.rm = TRUE)) %>% tbl_dt(FALSE)
  proc.time() - t
#   user  system elapsed
#  49.65    0.05   50.05
}

out1 %<>% select_(.dots = names(out4)) %>% arrange(SP) %>% mutate_each(funs(as.integer))
out2 %<>% select_(.dots = names(out4)) %>% arrange(SP)
out3 %<>% select_(.dots = names(out4)) %>% arrange(SP)
out4 %<>% arrange(SP)
if (run_plyr_method)
  out5 %<>% arrange(SP)

all.equal(out4, out1) # TRUE
all.equal(out4, out2) # TRUE
all.equal(out4, out3) # TRUE
if (run_plyr_method)
  all.equal(out4, out5) # TRUE