Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- id <- c("a", "b", "c", "d")
- in05 <- c(1, 0, 0, 1)
- in06 <- c(0, 0, 0, 1)
- in07 <- c(1, 1, 0, 1)
- in08 <- c(0, 1, 1, 1)
- in09 <- c(0, 0, 0, 1)
- df <- data.frame(id, in05, in06, in07, in08, in09)
- df$firstyear <- ifelse(df$in05==1,2005,
- ifelse(df$in06==1,2006,
- ifelse(df$in07==1, 2007,
- ifelse(df$in08==1, 2008,
- ifelse(df$in09==1, 2009,
- 0)))))
- indx <- names(df)[max.col(df[-1], ties.method = "first") + 1L]
- df$firstyear <- as.numeric(sub("in", "20", indx))
- df
- # id in05 in06 in07 in08 in09 firstyear
- # 1 a 1 0 1 0 0 2005
- # 2 b 0 0 1 1 0 2007
- # 3 c 0 0 0 1 0 2008
- # 4 d 1 1 1 1 1 2005
- df$FirstYear <- gsub('in', '20', names(df))[apply(df, 1, match, x=1)]
- df
- id in05 in06 in07 in08 in09 FirstYear
- 1 a 1 0 1 0 0 2005
- 2 b 0 0 1 1 0 2007
- 3 c 0 0 0 1 0 2008
- 4 d 1 1 1 1 1 2005
- x = 5; set.seed(199); tab = sample(1e6)
- identical(match(x, tab), which.max(x == tab))
- #[1] TRUE
- microbenchmark::microbenchmark(match(x, tab), which.max(x == tab), times = 25)
- #Unit: milliseconds
- # expr min lq median uq max neval
- # match(x, tab) 142.22327 142.50103 142.79737 143.19547 145.37669 25
- # which.max(x == tab) 18.91427 18.93728 18.96225 19.58932 38.34253 25
- ff = function(x)
- {
- x = as.list(x)
- ans = as.integer(x[[1]])
- for(i in 2:length(x)) {
- inds = ans == 0L
- if(!any(inds)) return(ans)
- ans[inds] = i * (x[[i]][inds] == 1)
- }
- return(ans)
- }
- david = function(x) max.col(x, "first")
- plafort = function(x) apply(x, 1, match, x = 1)
- ff(df[-1])
- #[1] 1 3 4 1
- david(df[-1])
- #[1] 1 3 4 1
- plafort(df[-1])
- #[1] 1 3 4 1
- set.seed(007)
- DF = data.frame(id = seq_len(1e6),
- "colnames<-"(matrix(sample(0:1, 1e7, T, c(0.25, 0.75)), 1e6),
- paste("in", 11:20, sep = "")))
- identical(ff(DF[-1]), david(DF[-1]))
- #[1] TRUE
- identical(ff(DF[-1]), plafort(DF[-1]))
- #[1] TRUE
- microbenchmark::microbenchmark(ff(DF[-1]), david(DF[-1]), as.matrix(DF[-1]), times = 30)
- #Unit: milliseconds
- # expr min lq median uq max neval
- # ff(DF[-1]) 64.83577 65.45432 67.87486 70.32073 86.72838 30
- # david(DF[-1]) 112.74108 115.12361 120.16118 132.04803 145.45819 30
- # as.matrix(DF[-1]) 20.87947 22.01819 27.52460 32.60509 45.84561 30
- system.time(plafort(DF[-1]))
- # user system elapsed
- # 4.117 0.000 4.125
- years <- as.integer(substr(names(df[-1]), 3, 4)) + 2000L
- cbind(df, yr=do.call(pmin.int, Map(`/`, years, df[-1])))
- id in05 in06 in07 in08 in09 yr
- 1 a 1 0 1 0 0 2005
- 2 b 0 0 1 1 0 2007
- 3 c 0 0 0 1 0 2008
- 4 d 1 1 1 1 1 2005
- Unit: milliseconds
- expr min lq median uq max neval
- do.call(pmin.int, Map(`/`, 11:20, DF[-1])) 178.46993 194.3760 219.8898 229.1597 307.1120 10
- ff(DF[-1]) 416.07297 434.0792 439.1970 452.8345 496.2048 10
- max.col(DF[-1], "first") 99.71936 138.2285 175.2334 207.6365 239.6519 10
- ff2 = function(x) {
- ans = as.integer(x[[1]])
- for(i in 2:length(x)) {
- inds = which(ans == 0L)
- if(!length(inds)) return(ans)
- ans[inds] = i * (x[[i]][inds] == 1)
- }
- return(ans)
- }
- Unit: milliseconds
- expr min lq median uq max neval
- ff(DF[-1]) 407.92699 415.11716 421.18274 428.02092 462.2474 10
- ff2(DF[-1]) 64.20484 72.74729 79.85748 81.29153 148.6439 10
- # Tidy
- df <- df %>%
- gather(year, present.or.not, -id)
- # Create df of first instances
- first.df <- df %>%
- group_by(id, present.or.not) %>%
- mutate(ranky = rank(cumsum(present.or.not)),
- first.year = year) %>%
- filter(ranky == 1)
- # Prepare for join
- first.df <- first.df[,c('id', 'first.year')]
- # Join with original
- df <- left_join(df,first.df)
- # Spread
- spread(df, year, present.or.not)
- df %>%
- gather(year, present_or_not, -id) %>%
- filter(present_or_not==1) %>%
- group_by(id) %>%
- arrange(id, year) %>%
- slice(1) %>%
- mutate(year = str_replace(year, "in", "20")) %>%
- select(1:2) %>%
- right_join(df)`
- # Using version 0.5.0.
- # Dev version may work without `with()`.
- df %>%
- mutate(., firstyear = with(., case_when(
- in05 == 1 ~ 2005,
- in06 == 1 ~ 2006,
- in07 == 1 ~ 2007,
- in08 == 1 ~ 2008,
- in09 == 1 ~ 2009,
- TRUE ~ 0
- )))
- library(tidyr)
- library(sqldf)
- newdf <- gather(df, year, code, -id)
- df$firstyear <- sqldf('SELECT min(rowid) rowid, id, year as firstyear
- FROM newdf
- WHERE code = 1
- GROUP BY id')[3]
- library(tidyr)
- df2 <- gather(df, year, code, -id)
- df2 <- df2[df2$code == 1, 1:2]
- df2 <- df2[!duplicated(df2$id), ]
- merge(df, df2)
- library(tidyr)
- library(dplyr)
- newdf <- gather(df, year, code, -id)
- df$firstyear <- (newdf %>%
- filter(code==1) %>%
- select(id, year) %>%
- group_by(id) %>%
- summarise(first = first(year)))[2]
- id in05 in06 in07 in08 in09 year
- 1 a 1 0 1 0 0 in05
- 2 b 0 0 1 1 0 in07
- 3 c 0 0 0 1 0 in08
- 4 d 1 1 1 1 1 in05
- names(df) <- c("id", 2005, 2006, 2007, 2008, 2009)
- df$firstyear <- names(df[-1])[apply(df[-1], 1, which.max)]
- id 2005 2006 2007 2008 2009 firstyear
- 1 a 1 0 1 0 0 2005
- 2 b 0 0 1 1 0 2007
- 3 c 0 0 0 1 0 2008
- 4 d 1 1 1 1 1 2005
- df$firstYear <- gsub('in', '20', names(df[-1]))[apply(df[-1], 1, which.max)]
- id in05 in06 in07 in08 in09 firstYear
- 1 a 1 0 1 0 0 2005
- 2 b 0 0 1 1 0 2007
- 3 c 0 0 0 1 0 2008
- 4 d 1 1 1 1 1 2005
Add Comment
Please, Sign In to add comment