Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(data.table)
- library(pipeR)
- library(zoo)
- library(plyr)
- # generate data
- numDays <- 20
- numID <- 3000
- numRecords <- numDays * numID * 0.5
- set.seed(5)
- DT <- data.table(matrix(rnorm(numDays * numID), numID)) %>>% `[`(j = ID := 1:numID) %>>%
- melt.data.table(id.vars = numDays + 1, variable.name = "Day", value.name = "X") %>>%
- `[`(j = Day := as.integer(mapvalues(Day, paste0("V", 1:numDays), 1:numDays))) %>>%
- `[`(i = sample(nrow(.), numRecords)) %>>% setorder(ID, Day)
- DT[ , Day2 := factor(Day, levels = seq(min(DT$Day), max(DT$Day)))]
- setkey(DT, ID, Day)
- # check the number of observation is greater than 1
- # DT[ , .N, by = .(ID)] %>>% `[[`("N") %>>% `!=`(1) %>>% all # TRUE
- mean2 <- function(x) {
- if (length(x) == 1 || all(is.na(x)))
- return(NA)
- mean(head(x, length(x) - 1), na.rm = TRUE)
- }
- f1 <- function(DT) {
- DT2 <- dcast.data.table(DT, ID ~ Day2, sum, fill = NA, drop = FALSE, value.var = "X")
- meanDT <- DT2[ , 2:ncol(DT2)] %>>% as.matrix %>>% t %>>%
- rollapply(8, mean2, partial = TRUE, align = "right") %>>% data.table %>>%
- melt.data.table(measure.var = 1:ncol(.), variable.name = "ID", value.name = "x_mean",
- variable.factor = FALSE) %>>%
- `[`(j = Day := 1:(ncol(DT2)-1), by = .(ID)) %>>%
- `[`(j = ID := as.integer(mapvalues(ID, paste0("V", 1:nrow(DT2)), DT2$ID)))
- return(merge(DT, meanDT, by = c("ID", "Day")))
- }
- f2 <- function(DT) {
- DT[CJ(unique(ID), seq(min(Day), max(Day)))] %>>%
- `[`(j = .(Day = rollapply(Day, 8, max, partial = TRUE, align = "right"),
- x_mean = rollapply(X, 8, mean2, partial = TRUE, align = "right")),
- by = .(ID)) %>>%
- merge(DT, by = c("ID", "Day"), all.y = TRUE)
- }
- mvAvg_f <- function(val, day, span = 7) {
- sapply(day, function(y){
- tmp <- day - y
- mean(val[which(tmp < 0 & tmp >= -span)])
- })
- }
- f3 <- function(DT) {
- DT[ , x_mean_2 := mvAvg_f(X, Day), by = .(ID)]
- }
- f4 <- function(DT) {
- DT[ , x_mean_3 := sapply(Day, function(s) mean(X[between(Day, s-7, s-1)])), by = .(ID)]
- }
- f5 <- function(DT) {
- DT[ , x_mean_4 := sapply(Day, function(s) mean(X[Day >= s-7 & Day < s])), by = .(ID)]
- }
- resDT1 <- f1(DT)
- resDT2 <- f2(DT)
- all.equal(resDT1, resDT2, ignore.col.order = TRUE) # TRUE
- f3(DT)
- f4(DT)
- f5(DT)
- all.equal(resDT1$x_mean, DT$x_mean_2, ignore.col.order = TRUE) # TRUE
- all.equal(resDT1$x_mean, DT$x_mean_3, ignore.col.order = TRUE) # TRUE
- all.equal(resDT1$x_mean, DT$x_mean_4, ignore.col.order = TRUE) # TRUE
- library(microbenchmark)
- microbenchmark(f1(DT), f2(DT), f3(DT), f4(DT), f5(DT), times = 10L)
- # Unit: milliseconds
- # expr min lq mean median uq max neval
- # f1(DT) 4879.9619 4909.4480 4961.4962 4938.1009 4997.4011 5152.2432 10
- # f2(DT) 4051.7383 4095.4616 4133.2843 4133.9591 4162.8146 4245.0343 10
- # f3(DT) 321.6416 322.3145 324.9348 323.4118 325.8534 331.5509 10
- # f4(DT) 1942.0352 1956.5268 1992.7289 1979.8526 2014.8336 2096.0038 10
- # f5(DT) 149.6435 151.4261 155.0831 152.9530 161.5350 162.9220 10
Advertisement
Add Comment
Please, Sign In to add comment