Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(data.table)
- library(plyr)
- library(dplyr)
- library(tidyr)
- library(reshape2)
- library(magrittr)
# Simulate P categorical variables with N observations each.
# Column i is drawn from Binomial(n_var[i], p_var[i]); with probability 0.05
# it is instead a single constant drawn from 1:6, which gives the
# preprocessing functions below some single-level columns to filter out.
P <- 50000
N <- 200
n_var <- sample(1:6, P, replace = TRUE)  # one size parameter per variable
p_var <- rbeta(P, 3, 3)                  # one success probability per variable
dat <- vapply(seq_len(P), function(i) {
  if (runif(1) < 0.05) {
    # Constant column: expand to N rows explicitly instead of returning a
    # scalar and relying on recycling when the table is built.
    rep(sample(1:6, 1), N)
  } else {
    rbinom(N, n_var[i], p_var[i])
  }
}, integer(N)) %>%
  tbl_dt(FALSE) %>%
  setnames(paste0("X", seq_len(P)))  # names follow P instead of a literal
# Baseline implementation: drop single-level columns and summarise the
# categories of the remaining ones.
#
# Args:
#   dat: a data.table (it is subset with `with = FALSE`) whose columns hold
#        categorical values.
#
# Returns:
#   A named list with
#     dataFilter    - dat restricted to columns with more than one observed
#                     level
#     cateCount     - variable-by-category count matrix built with acast();
#                     no `fill` is given, so absent combinations are left
#                     unfilled
#     minProportion - per kept column, the rarest category and its relative
#                     frequency (both as character, because c() coerces)
preprocess_f <- function(dat) {
  n_levels <- vapply(dat, function(x) length(table(x)), integer(1))
  dataFilter <- dat[, n_levels > 1, with = FALSE]

  # lapply (not apply) always yields a list of tables. apply() simplifies to
  # a matrix when every column has the same number of levels, and melt()
  # would then not produce the L1 column that acast() casts on.
  cateCount <- lapply(dataFilter, table) %>%
    melt() %>%
    acast(L1 ~ Var1, value.var = "value")

  minProportion <- ldply(dataFilter, function(x) {
    tmp <- table(x) / length(x)
    loc <- which.min(tmp)
    c(names(tmp)[loc], tmp[loc])
  })

  # Name the elements: callers read the result as `$dataFilter`,
  # `$cateCount` and `$minProportion`. Positional access still works.
  list(
    dataFilter = dataFilter,
    cateCount = cateCount,
    minProportion = minProportion
  )
}
# Second implementation: build the full count table once and derive both the
# column filter and the minimum proportions from it.
#
# Args:
#   dat: a data.table (it is subset with `with = FALSE`) whose columns hold
#        categorical values.
#
# Returns:
#   A named list with
#     dataFilter    - dat restricted to columns with more than one observed
#                     category
#     cateCount     - data.frame of counts, one row per category and one
#                     column per variable (0 where a category does not occur)
#     minProportion - per cateCount column, the position of the rarest
#                     category among the observed ones and its relative
#                     frequency
preprocess_f2 <- function(dat) {
  # Rows = variables, columns = categories. lapply (not apply) so the input
  # to melt() is always a list of tables, never a simplified matrix.
  count_mat <- lapply(dat, table) %>%
    melt() %>%
    acast(L1 ~ Var1, value.var = "value", fill = 0)

  # Select by name, not by position: acast() lays its rows out by sorted
  # variable name (X1, X10, X100, ...), which need not match dat's column
  # order, so a positional logical mask could keep the wrong columns.
  keep <- rownames(count_mat)[rowSums(count_mat > 0) > 1]
  dataFilter <- dat[, names(dat) %in% keep, with = FALSE]

  cateCount <- count_mat %>%
    t() %>%
    data.frame()

  # Use the actual number of observations instead of a hard-coded 200.
  n_obs <- nrow(dat)
  minProportion <- ldply(cateCount, function(x) {
    observed <- x[x > 0]
    c(which.min(observed), min(observed) / n_obs)
  })

  # Name the elements: callers read the result as `$dataFilter`,
  # `$cateCount` and `$minProportion`. Positional access still works.
  list(
    dataFilter = dataFilter,
    cateCount = cateCount,
    minProportion = minProportion
  )
}
# Third implementation: reshape to long form with gather() and count with
# dcast.data.table().
#
# Args:
#   dat: a tbl_dt / data.table whose columns hold categorical values.
#
# Returns:
#   A named list with
#     dataFilter    - dat restricted to columns with more than one observed
#                     category
#     cateCount     - table of counts, one row per category value and one
#                     column per variable (the `value` column is dropped)
#     minProportion - per cateCount column, the position of the rarest
#                     category among the observed ones and its relative
#                     frequency
preprocess_f3 <- function(dat) {
  cateCount <- gather(dat) %>%
    dcast.data.table(value ~ key, length) %>%
    tbl_dt() %>%
    select(-value)

  # Select by name, not by position: the cast table's columns follow the
  # sorted key values, which need not match dat's column order, so positions
  # computed on cateCount cannot be applied to dat directly.
  keep <- names(cateCount)[colSums(cateCount > 0) > 1]
  dataFilter <- dat %>% select(which(names(dat) %in% keep))

  # Use the actual number of observations instead of a hard-coded 200.
  n_obs <- nrow(dat)
  minProportion <- ldply(cateCount, function(x) {
    observed <- x[x > 0]
    c(which.min(observed), min(observed) / n_obs)
  })

  # Name the elements: callers read the result as `$dataFilter`,
  # `$cateCount` and `$minProportion`. Positional access still works.
  list(
    dataFilter = dataFilter,
    cateCount = cateCount,
    minProportion = minProportion
  )
}
# Note on the dataFilter step: it assumes that, for every X, no category is
# entirely absent -- only categories that actually occur in the data are
# counted.

# Time each implementation on the same input. The figures below are the
# timings recorded by the original author.
system.time(t1 <- preprocess_f(dat))
# user system elapsed
# 47.80 0.42 47.39
system.time(t2 <- preprocess_f2(dat))
# user system elapsed
# 40.35 0.23 40.03
system.time(t3 <- preprocess_f3(dat))
# user system elapsed
# 13.85 0.00 13.89

# Compare the three results element by element, by position:
#   [[1]] dataFilter, [[2]] cateCount, [[3]] minProportion.
# Positional access works whether or not the returned lists carry names.
# The original checks used `t1$dataFilter` etc.; on a list without those
# names `$` yields NULL, and all.equal(NULL, NULL) is TRUE regardless of the
# data, so the six TRUE results recorded with that form did not verify
# anything. Re-run these to see the real agreement between implementations.
all.equal(t1[[1]], t2[[1]])
all.equal(t1[[1]], t3[[1]])
all.equal(t1[[2]], t2[[2]])
all.equal(t1[[2]], t3[[2]])
all.equal(t1[[3]], t2[[3]])
all.equal(t1[[3]], t3[[3]])
Advertisement
Add Comment
Please, Sign In to add comment