Guest User

Untitled

a guest
Jan 21st, 2018
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.01 KB | None | 0 0
  1. id <- c("a", "b", "c", "d")
  2. in05 <- c(1, 0, 0, 1)
  3. in06 <- c(0, 0, 0, 1)
  4. in07 <- c(1, 1, 0, 1)
  5. in08 <- c(0, 1, 1, 1)
  6. in09 <- c(0, 0, 0, 1)
  7. df <- data.frame(id, in05, in06, in07, in08, in09)
  8.  
  9. df$firstyear <- ifelse(df$in05==1,2005,
  10. ifelse(df$in06==1,2006,
  11. ifelse(df$in07==1, 2007,
  12. ifelse(df$in08==1, 2008,
  13. ifelse(df$in09==1, 2009,
  14. 0)))))
  15.  
  16. indx <- names(df)[max.col(df[-1], ties.method = "first") + 1L]
  17. df$firstyear <- as.numeric(sub("in", "20", indx))
  18. df
  19. # id in05 in06 in07 in08 in09 firstyear
  20. # 1 a 1 0 1 0 0 2005
  21. # 2 b 0 0 1 1 0 2007
  22. # 3 c 0 0 0 1 0 2008
  23. # 4 d 1 1 1 1 1 2005
  24.  
  25. df$FirstYear <- gsub('in', '20', names(df))[apply(df, 1, match, x=1)]
  26. df
  27. id in05 in06 in07 in08 in09 FirstYear
  28. 1 a 1 0 1 0 0 2005
  29. 2 b 0 0 1 1 0 2007
  30. 3 c 0 0 0 1 0 2008
  31. 4 d 1 1 1 1 1 2005
  32.  
  33. x = 5; set.seed(199); tab = sample(1e6)
  34. identical(match(x, tab), which.max(x == tab))
  35. #[1] TRUE
  36. microbenchmark::microbenchmark(match(x, tab), which.max(x == tab), times = 25)
  37. #Unit: milliseconds
  38. # expr min lq median uq max neval
  39. # match(x, tab) 142.22327 142.50103 142.79737 143.19547 145.37669 25
  40. # which.max(x == tab) 18.91427 18.93728 18.96225 19.58932 38.34253 25
  41.  
  42. ff = function(x)
  43. {
  44. x = as.list(x)
  45. ans = as.integer(x[[1]])
  46. for(i in 2:length(x)) {
  47. inds = ans == 0L
  48. if(!any(inds)) return(ans)
  49. ans[inds] = i * (x[[i]][inds] == 1)
  50. }
  51. return(ans)
  52. }
  53.  
  54. david = function(x) max.col(x, "first")
  55. plafort = function(x) apply(x, 1, match, x = 1)
  56.  
  57. ff(df[-1])
  58. #[1] 1 3 4 1
  59. david(df[-1])
  60. #[1] 1 3 4 1
  61. plafort(df[-1])
  62. #[1] 1 3 4 1
  63.  
  64. set.seed(007)
  65. DF = data.frame(id = seq_len(1e6),
  66. "colnames<-"(matrix(sample(0:1, 1e7, T, c(0.25, 0.75)), 1e6),
  67. paste("in", 11:20, sep = "")))
  68. identical(ff(DF[-1]), david(DF[-1]))
  69. #[1] TRUE
  70. identical(ff(DF[-1]), plafort(DF[-1]))
  71. #[1] TRUE
  72. microbenchmark::microbenchmark(ff(DF[-1]), david(DF[-1]), as.matrix(DF[-1]), times = 30)
  73. #Unit: milliseconds
  74. # expr min lq median uq max neval
  75. # ff(DF[-1]) 64.83577 65.45432 67.87486 70.32073 86.72838 30
  76. # david(DF[-1]) 112.74108 115.12361 120.16118 132.04803 145.45819 30
  77. # as.matrix(DF[-1]) 20.87947 22.01819 27.52460 32.60509 45.84561 30
  78.  
  79. system.time(plafort(DF[-1]))
  80. # user system elapsed
  81. # 4.117 0.000 4.125
  82.  
  83. years <- as.integer(substr(names(df[-1]), 3, 4)) + 2000L
  84. cbind(df, yr=do.call(pmin.int, Map(`/`, years, df[-1])))
  85.  
  86. id in05 in06 in07 in08 in09 yr
  87. 1 a 1 0 1 0 0 2005
  88. 2 b 0 0 1 1 0 2007
  89. 3 c 0 0 0 1 0 2008
  90. 4 d 1 1 1 1 1 2005
  91.  
  92. Unit: milliseconds
  93. expr min lq median uq max neval
  94. do.call(pmin.int, Map(`/`, 11:20, DF[-1])) 178.46993 194.3760 219.8898 229.1597 307.1120 10
  95. ff(DF[-1]) 416.07297 434.0792 439.1970 452.8345 496.2048 10
  96. max.col(DF[-1], "first") 99.71936 138.2285 175.2334 207.6365 239.6519 10
  97.  
  98. ff2 = function(x) {
  99. ans = as.integer(x[[1]])
  100. for(i in 2:length(x)) {
  101. inds = which(ans == 0L)
  102. if(!length(inds)) return(ans)
  103. ans[inds] = i * (x[[i]][inds] == 1)
  104. }
  105. return(ans)
  106. }
  107.  
  108. Unit: milliseconds
  109. expr min lq median uq max neval
  110. ff(DF[-1]) 407.92699 415.11716 421.18274 428.02092 462.2474 10
  111. ff2(DF[-1]) 64.20484 72.74729 79.85748 81.29153 148.6439 10
  112.  
  113. # Tidy
  114. df <- df %>%
  115. gather(year, present.or.not, -id)
  116.  
  117. # Create df of first instances
  118. first.df <- df %>%
  119. group_by(id, present.or.not) %>%
  120. mutate(ranky = rank(cumsum(present.or.not)),
  121. first.year = year) %>%
  122. filter(ranky == 1)
  123.  
  124. # Prepare for join
  125. first.df <- first.df[,c('id', 'first.year')]
  126.  
  127. # Join with original
  128. df <- left_join(df,first.df)
  129.  
  130. # Spread
  131. spread(df, year, present.or.not)
  132.  
  133. df %>%
  134. gather(year, present_or_not, -id) %>%
  135. filter(present_or_not==1) %>%
  136. group_by(id) %>%
  137. arrange(id, year) %>%
  138. slice(1) %>%
  139. mutate(year = str_replace(year, "in", "20")) %>%
  140. select(1:2) %>%
  141. right_join(df)`
  142.  
  143. # Using version 0.5.0.
  144. # Dev version may work without `with()`.
  145. df %>%
  146. mutate(., firstyear = with(., case_when(
  147. in05 == 1 ~ 2005,
  148. in06 == 1 ~ 2006,
  149. in07 == 1 ~ 2007,
  150. in08 == 1 ~ 2008,
  151. in09 == 1 ~ 2009,
  152. TRUE ~ 0
  153. )))
  154.  
  155. library(tidyr)
  156. library(sqldf)
  157. newdf <- gather(df, year, code, -id)
  158. df$firstyear <- sqldf('SELECT min(rowid) rowid, id, year as firstyear
  159. FROM newdf
  160. WHERE code = 1
  161. GROUP BY id')[3]
  162.  
  163. library(tidyr)
  164. df2 <- gather(df, year, code, -id)
  165. df2 <- df2[df2$code == 1, 1:2]
  166. df2 <- df2[!duplicated(df2$id), ]
  167. merge(df, df2)
  168.  
  169. library(tidyr)
  170. library(dplyr)
  171. newdf <- gather(df, year, code, -id)
  172. df$firstyear <- (newdf %>%
  173. filter(code==1) %>%
  174. select(id, year) %>%
  175. group_by(id) %>%
  176. summarise(first = first(year)))[2]
  177.  
  178. id in05 in06 in07 in08 in09 year
  179. 1 a 1 0 1 0 0 in05
  180. 2 b 0 0 1 1 0 in07
  181. 3 c 0 0 0 1 0 in08
  182. 4 d 1 1 1 1 1 in05
  183.  
  184. names(df) <- c("id", 2005, 2006, 2007, 2008, 2009)
  185. df$firstyear <- names(df[-1])[apply(df[-1], 1, which.max)]
  186.  
  187. id 2005 2006 2007 2008 2009 firstyear
  188. 1 a 1 0 1 0 0 2005
  189. 2 b 0 0 1 1 0 2007
  190. 3 c 0 0 0 1 0 2008
  191. 4 d 1 1 1 1 1 2005
  192.  
  193. df$firstYear <- gsub('in', '20', names(df[-1]))[apply(df[-1], 1, which.max)]
  194.  
  195. id in05 in06 in07 in08 in09 firstYear
  196. 1 a 1 0 1 0 0 2005
  197. 2 b 0 0 1 1 0 2007
  198. 3 c 0 0 0 1 0 2008
  199. 4 d 1 1 1 1 1 2005
Add Comment
Please, Sign In to add comment