Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Example data: a list of 500 hourly POSIXct sequences running from 2000/1/1
# to 2017/1/1. The generating function ignores its argument, so all 500
# elements are the same sequence.
ts <- lapply(
  seq_len(500),
  function(x) seq(as.POSIXct("2000/1/1"), as.POSIXct("2017/1/1"), by = "hours")
)

# Approach 1: concatenate every element with c(), then de-duplicate.
t <- unique(do.call("c", ts))

# Approach 2: flatten with unlist(), de-duplicate the flattened values, and
# convert the result back to POSIXct relative to the Unix epoch.
t <- as.POSIXct(unique(unlist(ts, use.names = FALSE)), origin = "1970-01-01")

# Approach 3: de-duplicate in chunks of 30 list elements first ...
t <- lapply(split(ts, ceiling(seq_along(ts) / 30)), function(x) {
  unique(unlist(x, use.names = FALSE))
})
# ... then de-duplicate across the per-chunk results. Those results live in
# `t`; `x` only exists inside the anonymous function above, so the final pass
# must read `t`. Unlike approach 2, this result is not converted back to
# POSIXct.
t <- unique(unlist(t, use.names = FALSE))
# ff1: a simple, loop-based version of `Reduce(unique(c()), )`.
#
# Walks the list `x` and, after appending each element to the running result,
# de-duplicates the whole result again with unique().
#
# x:       a list of numeric-like vectors (the examples use POSIXct vectors
#          and plain doubles).
# returns: a POSIXct vector of the distinct values, in order of first
#          appearance; a zero-length POSIXct vector when `x` contributes no
#          values at all.
ff1 <- function(x) {
  ans <- NULL
  for (elt in x) {
    ans <- unique(c(ans, elt))
  }
  # With an empty list (or only NULL elements) `ans` is still NULL, and
  # .POSIXct() cannot set a class attribute on NULL.
  if (is.null(ans)) {
    ans <- double()
  }
  .POSIXct(ans)
}
- system.time({ ans1 = ff1(ts) })
- # user system elapsed
- # 11.41 1.25 12.74
# ff2: accumulate distinct values by appending only what has not been seen.
#
# For each element of the list `x`, keeps the values that are not yet in the
# running result and appends them, instead of re-running unique() over the
# whole result on every iteration.
#
# x:       a list of numeric-like vectors (the examples use POSIXct vectors
#          and plain doubles).
# returns: a POSIXct vector of the distinct values, in order of first
#          appearance; a zero-length POSIXct vector when `x` contributes no
#          values at all.
ff2 <- function(x) {
  ans <- NULL
  for (elt in x) {
    # A value is new only if it is absent from the result so far AND is the
    # first occurrence inside this element; checking membership in `ans` alone
    # would let duplicates within a single element through.
    new <- !(elt %in% ans) & !duplicated(elt)
    if (any(new)) {
      ans <- c(ans, elt[new])
    }
  }
  # With an empty list (or only empty elements) `ans` is still NULL, and
  # .POSIXct() cannot set a class attribute on NULL.
  if (is.null(ans)) {
    ans <- double()
  }
  .POSIXct(ans)
}
- system.time({ ans2 = ff2(ts) })
- # user system elapsed
- # 6.65 1.12 7.93
# Thin wrappers around fastmatch functions reached through `:::`.
# NOTE(review): `:::` access suggests these are internal to fastmatch rather
# than part of its exported API, so they may change between package versions;
# confirm against the installed version.

# HASH: forwards `x` and `size` to fastmatch:::mk.hash().
HASH <- function(x, size) {
  fastmatch:::mk.hash(x = x, size = size)
}

# APPEND: forwards the hash object `x` and the values `what` to
# fastmatch:::append.hash(), always with index = FALSE.
APPEND <- function(x, what) {
  fastmatch:::append.hash(hash = x, x = what, index = FALSE)
}

# HTABLE: forwards `x` to fastmatch:::levels.fasthash().
HTABLE <- function(x) {
  fastmatch:::levels.fasthash(x)
}
# ff3: accumulate distinct values through a hash object.
#
# Creates a hash from an empty double vector with HASH(), feeds every element
# of the list `x` into it with APPEND(), and reads the collected values back
# with HTABLE(). HASH, APPEND and HTABLE are the wrappers around fastmatch
# defined alongside this function.
#
# x:       a list of numeric-like vectors (the examples use POSIXct vectors
#          and plain doubles).
# size:    passed straight to HASH(); the calls in this file supply an
#          estimate of, or the known number of, unique values.
# returns: the values reported by HTABLE(), wrapped as POSIXct.
ff3 <- function(x, size) {
  h <- HASH(double(), size)
  for (elt in x) {
    h <- APPEND(h, elt)
  }
  .POSIXct(HTABLE(h))
}
- system.time({ ans3 = ff3(ts, sum(lengths(ts)) / 1e2) }) #an estimate of unique values
- # user system elapsed
- # 4.81 0.00 4.87
- system.time({ ans3b = ff3(ts, length(ts[[1]])) }) #we know the number of uniques
- # user system elapsed
- # 2.03 0.03 2.10
- all.equal(ans1, ans2)
- #[1] TRUE
- all.equal(ans2, ans3)
- #[1] TRUE
- set.seed(1821)
- tmp = split(sample(1e2, 26, TRUE) + 0, rep(1:4, c(6, 3, 11, 6)))
- identical(unique(unlist(tmp)), as.double(ff1(tmp)))
- #[1] TRUE
- identical(unique(unlist(tmp)), as.double(ff2(tmp)))
- #[1] TRUE
- identical(unique(unlist(tmp)), as.double(ff3(tmp, 1e2)))
- #[1] TRUE
- R> ts <- lapply(c(1:500), function(x) seq(as.POSIXct("2000/1/1"), as.POSIXct("2017/1/1"), "hours"))
- R> gc()
- used (Mb) gc trigger (Mb) max used (Mb)
- Ncells 221687 11.9 460000 24.6 392929 21
- Vcells 74925836 571.7 112760349 860.3 79427802 606
- R> t <- unique(do.call("c", ts))
- R> gc()
- used (Mb) gc trigger (Mb) max used (Mb)
- Ncells 221729 11.9 460000 24.6 392929 21.0
- Vcells 75074953 572.8 413082177 3151.6 507227169 3869.9
- R> print(object.size(ts), units="MB")
- 568.8 Mb
- R> sessionInfo()
- R version 3.3.2 (2016-10-31)
- Platform: x86_64-pc-linux-gnu (64-bit)
- Running under: Ubuntu 14.04.5 LTS
- locale:
- [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
- [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
- [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
- attached base packages:
- [1] stats graphics grDevices utils datasets methods base
- R> ts <- lapply(1:500, function(x) seq(as.POSIXct("2000-01-01"), as.POSIXct("2017-01-01"), "hours"))
- R> gc()
- used (Mb) gc trigger (Mb) max used (Mb)
- Ncells 221686 11.9 460000 24.6 371201 19.9
- Vcells 74925836 571.7 111681359 852.1 80924280 617.5
- R> u <- do.call("c", c(ts, recursive = FALSE, use.names = FALSE))
- R> gc()
- used (Mb) gc trigger (Mb) max used (Mb)
- Ncells 221725 11.9 460000 24.6 371201 19.9
- Vcells 149446409 1140.2 413082872 3151.6 373009943 2845.9
- R> ts <- lapply(1:500, function(x) seq(as.POSIXct("2000-01-01"), as.POSIXct("2017-01-01"), "hours"))
- R> gc()
- used (Mb) gc trigger (Mb) max used (Mb)
- Ncells 221686 11.9 460000 24.6 371201 19.9
- Vcells 74925836 571.7 111681359 852.1 80924280 617.5
- R> u <- .POSIXct(unlist(ts, recursive = FALSE, use.names = FALSE))
- R> gc()
- used (Mb) gc trigger (Mb) max used (Mb)
- Ncells 221695 11.9 460000 24.6 371201 19.9
- Vcells 149446337 1140.2 358453576 2734.8 298487368 2277.3
- R> ts <- lapply(c(1:500), function(x) seq(as.POSIXct("2000/1/1"), as.POSIXct("2017/1/1"), "hours"))
- R> gc()
- used (Mb) gc trigger (Mb) max used (Mb)
- Ncells 218429 11.7 460000 24.6 389555 20.9
- Vcells 74922694 571.7 111432506 850.2 81226910 619.8
- R> u <- Reduce(function(x, y) unique(c(x, y)), ts)
- R> gc()
- used (Mb) gc trigger (Mb) max used (Mb)
- Ncells 218893 11.7 460000 24.6 389555 20.9
- Vcells 75072416 572.8 111432506 850.2 111399894 850.0
Add Comment
Please, Sign In to add comment