celestialgod

data preprocessing

Apr 23rd, 2015
391
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 2.23 KB | None | 0 0
  1. library(data.table)
  2. library(plyr)
  3. library(dplyr)
  4. library(tidyr)
  5. library(reshape2)
  6. library(magrittr)
  # Simulation sizes: P variables (columns), N observations (rows).
  P = 50000
  N = 200
  # Per-variable binomial parameters: size drawn from 1..6, probability
  # drawn from Beta(3, 3).
  n_var = sample(1:6, P, TRUE)
  p_var = rbeta(P, 3, 3)
  # Build one column per variable. With probability 0.05 a column is a single
  # constant drawn from 1..6; otherwise it is N binomial draws.
  # lapply() always yields a list, whereas sapply() would return a matrix or
  # a vector when all columns happen to have the same length.
  # NOTE(review): a length-1 column relies on the table constructor recycling
  # it to N rows -- confirm with the installed data.table version.
  dat = lapply(seq_len(P), function(i){
      if (runif(1) < 0.05) sample(1:6, 1) else rbinom(N, n_var[i], p_var[i])
    }) %>% tbl_dt(FALSE) %>% setnames(paste0("X", seq_len(P)))
  15.  
  # Baseline preprocessing.
  #
  # dat: a data.table-like table whose columns are categorical variables.
  #
  # Returns a named list:
  #   dataFilter    - `dat` restricted to columns with more than one observed
  #                   category.
  #   cateCount     - variable-by-category array of counts built from
  #                   `dataFilter` (no fill value is supplied to acast()).
  #   minProportion - per variable, the name of the least frequent observed
  #                   category and its proportion, as assembled by ldply().
  preprocess_f = function(dat){
    # Number of distinct observed values per column; vapply() pins the
    # result type to one integer per column.
    n_levels = vapply(dat, function(x) length(table(x)), integer(1))
    dataFilter = dat[, n_levels > 1, with = FALSE]
    # lapply() always gives a list of tables; apply(., 2, table) would
    # collapse to a matrix when every column has the same number of levels.
    cateCount = lapply(dataFilter, table) %>% melt() %>%
      acast(L1~Var1, value.var="value")
    minProportion = ldply(dataFilter, function(x){
      tmp = table(x)/length(x)
      loc = which.min(tmp)
      c(names(tmp)[loc], tmp[loc])
    })
    # Name the elements so callers can use `$dataFilter` etc.; positional
    # access ([[1]], [[2]], [[3]]) keeps working.
    list(dataFilter = dataFilter,
         cateCount = cateCount,
         minProportion = minProportion)
  }
  26.  
  # Variant that counts categories once and derives the other outputs from
  # the count table.
  #
  # dat: a data.table-like table whose columns are categorical variables.
  #
  # Returns a named list:
  #   dataFilter    - `dat` restricted to columns with more than one observed
  #                   category.
  #   cateCount     - data.frame with one column per variable of `dat` and
  #                   one row per category; absent categories are filled
  #                   with 0.
  #   minProportion - per variable, the position (among the categories with a
  #                   positive count) of the smallest count, and that count
  #                   divided by the number of rows of `dat`.
  preprocess_f2 = function(dat){
    # lapply() always gives a list of tables; apply(., 2, table) would
    # collapse to a matrix when every column has the same number of levels.
    cateCount = lapply(dat, table) %>% melt() %>%
      acast(L1~Var1, value.var="value", fill=0) %>%
      t() %>% data.frame()
    # Select by column name, not position: the reshaped count table need not
    # keep the column order of `dat`.
    multi = names(cateCount)[colSums(cateCount > 0) > 1]
    dataFilter = dat[, names(dat) %in% multi, with = FALSE]
    # Divide by the actual row count instead of a hard-coded 200.
    n_obs = nrow(dat)
    minProportion = ldply(cateCount, function(x) c(which.min(x[x > 0]),
      min(x[x > 0]) / n_obs))
    # Name the elements so callers can use `$dataFilter` etc.; positional
    # access ([[1]], [[2]], [[3]]) keeps working.
    list(dataFilter = dataFilter,
         cateCount = cateCount,
         minProportion = minProportion)
  }
  36.  
  # Variant that builds the count table by reshaping to long form and
  # casting with data.table.
  #
  # dat: a data.table-like table whose columns are categorical variables.
  #
  # Returns a named list:
  #   dataFilter    - `dat` restricted to columns with more than one observed
  #                   category.
  #   cateCount     - table with one column per variable of `dat` and one row
  #                   per observed value, holding occurrence counts.
  #   minProportion - per variable, the position (among the categories with a
  #                   positive count) of the smallest count, and that count
  #                   divided by the number of rows of `dat`.
  preprocess_f3 = function(dat){
    cateCount = gather(dat) %>% dcast.data.table(value~key, length) %>%
      tbl_dt() %>% select(-value)
    # Select by column name, not position: the cast table need not keep the
    # column order of `dat`.
    multi = names(cateCount)[colSums(cateCount > 0) > 1]
    dataFilter = dat[, names(dat) %in% multi, with = FALSE]
    # Divide by the actual row count instead of a hard-coded 200.
    n_obs = nrow(dat)
    minProportion = ldply(cateCount, function(x) c(which.min(x[x > 0]),
      min(x[x > 0]) / n_obs))
    # Name the elements so callers can use `$dataFilter` etc.; positional
    # access ([[1]], [[2]], [[3]]) keeps working.
    list(dataFilter = dataFilter,
         cateCount = cateCount,
         minProportion = minProportion)
  }
  # Note: the dataFilter line assumes that, for every X, no category is absent from the data; it only counts the categories that actually appear in the data.
  46.  
  # Time each implementation. The figures in the comments are the timings
  # recorded with the original script, not re-measured here.
  system.time(t1 <- preprocess_f(dat))
  #   user  system elapsed
  #  47.80    0.42   47.39
  system.time(t2 <- preprocess_f2(dat))
  #   user  system elapsed
  #  40.35    0.23   40.03
  system.time(t3 <- preprocess_f3(dat))
  #   user  system elapsed
  #  13.85    0.00   13.89

  # Compare the three results element by element, by position:
  #   [[1]] = dataFilter, [[2]] = cateCount, [[3]] = minProportion.
  # `$dataFilter` on an unnamed list is NULL, so name-based checks against
  # unnamed results compare NULL with NULL and always print TRUE. Positional
  # access works whether or not the list elements are named.
  # NOTE(review): the outputs below have not been re-recorded; rerun to see
  # whether the implementations really agree.
  all.equal(t1[[1]], t2[[1]])
  all.equal(t1[[1]], t3[[1]])
  all.equal(t1[[2]], t2[[2]])
  all.equal(t1[[2]], t3[[2]])
  all.equal(t1[[3]], t2[[3]])
  all.equal(t1[[3]], t3[[3]])
Advertisement
Add Comment
Please, Sign In to add comment