Advertisement
SteveWeston

foreach/doParallel/PSOCK/data.table benchmark

Sep 15th, 2013
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 0.96 KB | None | 0 0
  1. suppressMessages(library(doParallel))
  2. library(itertools)
  3. library(data.table)
  4.  
  5. set.seed(107)
  6. n <- 1000000
  7. m <- 10000
  8. td <- data.frame(val=rnorm(n), id=sample(m, n, replace=TRUE))
  9.  
  10. cl <- makePSOCKcluster(4)
  11. registerDoParallel(cl)
  12. workers <- getDoParWorkers()
  13.  
  14. vadd <- function(a, ...) {
  15.   for (v in list(...))
  16.     a <- a + v
  17.   a
  18. }
  19. ownermean <- function(v, mine) if (mine) mean(v) else 0
  20.  
  21. start <- proc.time()[3]
  22. nuniq <- length(unique(td$id))
  23. DT <- data.table(td)
  24. res <- foreach(grps=isplitIndices(nuniq, chunks=workers),
  25.                .combine='vadd',
  26.                .multicombine=TRUE,
  27.                .inorder=FALSE,
  28.                .packages='data.table') %dopar% {
  29.   DT[, means := ownermean(DT[-.I, val], .GRP %in% grps), by=id]
  30.   DT$means
  31. }
  32. elapsed <- proc.time()[3] - start
  33.  
  34. library(digest)
  35. cat(sprintf("foreach/doParallel/PSOCK/data.table with %d workers:\n", workers))
  36. cat(sprintf("Elapsed time: %f, MD5 hash: %s\n", elapsed, digest(res)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement