Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(pipeR)
- library(stringr)
- library(data.table)
- library(fastmatch)
- library(plyr)
- library(text2vec)
- library(Matrix)
- # 資料生成
- numPlayers <- 500
- numGames <- 300000
- namePlayers <- sprintf("P_%03d", 1:numPlayers)
- getCombinedFunc <- function(data, numSampling, numGroup) {
- DT <- data.table(V = sample(data, numGroup * numSampling, TRUE),
- i = rep(1:numSampling, each = numGroup,
- length.out = numGroup * numSampling), key = "i")
- # 確保每一列都是五個不同的PlayerNames
- uniqueDT <- unique(DT)
- while (nrow(uniqueDT) < numSampling * numGroup) {
- tmpDT <- uniqueDT[ , .N, by = .(i)][N < 5][ , N := 5 - N]
- uniqueDT <- rbind(uniqueDT, data.table(V = sample(data, nrow(tmpDT), TRUE),
- i = tmpDT$i)) %>>% unique
- }
- return(uniqueDT[ , .(combinedV = str_c(V, collapse = ",")), by = .(i)]$combinedV)
- }
- # 測一下生成時間
- system.time(getCombinedFunc(namePlayers, numGames, 5)) # 1.64 seconds
- # 生成目標資料表
- DT <- data.table(attack = getCombinedFunc(namePlayers, numGames, 5),
- defence = getCombinedFunc(namePlayers, numGames, 5))
- # 修改自andrew大的方法
- andrew <- function(data, name.player) {
- out.attack <- strsplit(data[[1]], ",") %>>%
- sapply(function(x) name.player %in% x) %>>% t %>>%
- `colnames<-`(str_c("attack_", name.player)) %>>%
- mapvalues(c(TRUE, FALSE), c(1L, 0L), FALSE)
- out.defence <- strsplit(data[[2]], ",") %>>%
- sapply(function(x) name.player %in% x) %>>% t %>>%
- `colnames<-`(str_c("defense_", name.player)) %>>%
- mapvalues(c(TRUE, FALSE), c(-1L, 0L), FALSE)
- cbind(out.attack, out.defence)
- }
- # 修改自wush大的方法
- wush <- function(data, name.player) {
- it <- itoken(data[[1]], tokenizer = word_tokenizer, progressbar = FALSE, n_chunks = 10)
- it2 <- itoken(data[[2]], tokenizer = word_tokenizer, progressbar = FALSE, n_chunks = 10)
- vocab <- create_vocabulary(name.player)
- vectorizer <- vocab_vectorizer(vocab)
- m1 <- create_dtm(it, vectorizer)
- colnames(m1) <- str_c("attack_", colnames(m1))
- m2 <- create_dtm(it2, vectorizer)
- colnames(m2) <- str_c("defense_", colnames(m2))
- m2@x[] <- -1
- cbind(m1, m2)
- }
- # 我的方法
- getLocMatFunc <- function(x, table, value = 1, colnames = NULL) {
- tmp <- str_split(x, ",")
- # 找出column位置
- j <- fmatch(do.call(c, tmp), table)
- # 找出row位置
- i <- do.call(c, mapply(function(i, x) rep(i, length(x)), seq_along(tmp), tmp, SIMPLIFY = FALSE))
- # 產生出sparse matrix
- sparseMatrix(i, j, x = value, dims = c(length(x), length(table)), dimnames = list(NULL, colnames))
- }
- getMatrixFunc <- function(DT, namePlayers) {
- cbind(getLocMatFunc(DT$attack, namePlayers, 1., str_c("attack_", namePlayers)),
- getLocMatFunc(DT$defence, namePlayers, -1., str_c("defense_", namePlayers)))
- }
- # check結果
- Andrew <- andrew(DT, namePlayers)
- Wush <- wush(DT, namePlayers)
- rownames(Wush) <- NULL
- MyMethod <- getMatrixFunc(DT, namePlayers)
- all.equal(Wush, Matrix(Andrew, sparse = TRUE)) # TRUE
- all.equal(MyMethod, Wush) # TRUE
- all.equal(MyMethod, Matrix(Andrew, sparse = TRUE)) # TRUE
- # 使用microbenchmark
- library(microbenchmark)
- microbenchmark(
- Andrew = andrew(DT, namePlayers),
- Wush = wush(DT, namePlayers),
- MyMethod = getMatrixFunc(DT, namePlayers),
- times = 10L
- )
- # Unit: seconds
- # expr min lq mean median uq max neval
- # Andrew 25.564674 25.631636 26.357786 26.429542 26.804092 27.312797 10
- # Wush 8.051787 8.127275 8.327858 8.319552 8.556822 8.621760 10
- # MyMethod 1.978885 2.033370 2.240003 2.145650 2.334539 2.959432 10
Advertisement
Add Comment
Please, Sign In to add comment