produce sparse matrix

library(pipeR)
library(stringr)
library(data.table)
library(fastmatch)
library(plyr)
library(text2vec)
library(Matrix)

# 資料生成
numPlayers <- 500
numGames <- 300000
namePlayers <- sprintf("P_%03d", 1:numPlayers)

getCombinedFunc <- function(data, numSampling, numGroup) {
  DT <- data.table(V = sample(data, numGroup * numSampling, TRUE),
                   i = rep(1:numSampling, each = numGroup,
                           length.out = numGroup * numSampling), key = "i")
  # 確保每一列都是五個不同的PlayerNames
  uniqueDT <- unique(DT)
  while (nrow(uniqueDT) < numSampling * numGroup) {
    tmpDT <- uniqueDT[ , .N, by = .(i)][N < 5][ , N := 5 - N]
    uniqueDT <- rbind(uniqueDT, data.table(V = sample(data, nrow(tmpDT), TRUE),
                                           i = tmpDT$i)) %>>% unique
  }
  return(uniqueDT[ , .(combinedV = str_c(V, collapse = ",")), by = .(i)]$combinedV)
}
# 測一下生成時間
system.time(getCombinedFunc(namePlayers, numGames, 5)) # 1.64 seconds

# 生成目標資料表
DT <- data.table(attack = getCombinedFunc(namePlayers, numGames, 5),
                 defence = getCombinedFunc(namePlayers, numGames, 5))

# 修改自andrew大的方法
andrew <- function(data, name.player) {
  out.attack <- strsplit(data[[1]], ",") %>>%
    sapply(function(x) name.player %in% x) %>>% t %>>%
    `colnames<-`(str_c("attack_", name.player)) %>>%
    mapvalues(c(TRUE, FALSE), c(1L, 0L), FALSE)
  out.defence <- strsplit(data[[2]], ",") %>>%
    sapply(function(x) name.player %in% x) %>>% t %>>%
    `colnames<-`(str_c("defense_", name.player)) %>>%
    mapvalues(c(TRUE, FALSE), c(-1L, 0L), FALSE)
  cbind(out.attack, out.defence)
}

# 修改自wush大的方法
wush <- function(data, name.player) {
  it <- itoken(data[[1]], tokenizer = word_tokenizer, progressbar = FALSE, n_chunks = 10)
  it2 <- itoken(data[[2]], tokenizer = word_tokenizer, progressbar = FALSE, n_chunks = 10)
  vocab <- create_vocabulary(name.player)
  vectorizer <- vocab_vectorizer(vocab)
  m1 <- create_dtm(it, vectorizer)
  colnames(m1) <- str_c("attack_", colnames(m1))
  m2 <- create_dtm(it2, vectorizer)
  colnames(m2) <- str_c("defense_", colnames(m2))
  m2@x[] <- -1
  cbind(m1, m2)
}

# 我的方法
getLocMatFunc <- function(x, table, value = 1, colnames = NULL) {
  tmp <- str_split(x, ",")
  # 找出column位置
  j <- fmatch(do.call(c, tmp), table)
  # 找出row位置
  i <- do.call(c, mapply(function(i, x) rep(i, length(x)), seq_along(tmp), tmp, SIMPLIFY = FALSE))
  # 產生出sparse matrix
  sparseMatrix(i, j, x = value, dims = c(length(x), length(table)), dimnames = list(NULL, colnames))
}

getMatrixFunc <- function(DT, namePlayers) {
  cbind(getLocMatFunc(DT$attack, namePlayers, 1., str_c("attack_", namePlayers)),
        getLocMatFunc(DT$defence, namePlayers, -1., str_c("defense_", namePlayers)))
}

# check結果
Andrew <- andrew(DT, namePlayers)
Wush <- wush(DT, namePlayers)
rownames(Wush) <- NULL
MyMethod <- getMatrixFunc(DT, namePlayers)

all.equal(Wush, Matrix(Andrew, sparse = TRUE)) # TRUE
all.equal(MyMethod, Wush) # TRUE
all.equal(MyMethod, Matrix(Andrew, sparse = TRUE)) # TRUE

# 使用microbenchmark
library(microbenchmark)
microbenchmark(
  Andrew = andrew(DT, namePlayers),
  Wush = wush(DT, namePlayers),
  MyMethod = getMatrixFunc(DT, namePlayers),
  times = 10L
)
# Unit: seconds
#      expr       min        lq      mean    median        uq       max neval
#    Andrew 25.564674 25.631636 26.357786 26.429542 26.804092 27.312797    10
#      Wush  8.051787  8.127275  8.327858  8.319552  8.556822  8.621760    10
#  MyMethod  1.978885  2.033370  2.240003  2.145650  2.334539  2.959432    10