celestialgod

produce sparse matrix

Jan 8th, 2018
238
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 3.67 KB | None | 0 0
  1. library(pipeR)
  2. library(stringr)
  3. library(data.table)
  4. library(fastmatch)
  5. library(plyr)
  6. library(text2vec)
  7. library(Matrix)
  8.  
  9. # 資料生成
  10. numPlayers <- 500
  11. numGames <- 300000
  12. namePlayers <- sprintf("P_%03d", 1:numPlayers)
  13.  
  14. getCombinedFunc <- function(data, numSampling, numGroup) {
  15.   DT <- data.table(V = sample(data, numGroup * numSampling, TRUE),
  16.                    i = rep(1:numSampling, each = numGroup,
  17.                            length.out = numGroup * numSampling), key = "i")
  18.   # 確保每一列都是五個不同的PlayerNames
  19.   uniqueDT <- unique(DT)
  20.   while (nrow(uniqueDT) < numSampling * numGroup) {
  21.     tmpDT <- uniqueDT[ , .N, by = .(i)][N < 5][ , N := 5 - N]
  22.     uniqueDT <- rbind(uniqueDT, data.table(V = sample(data, nrow(tmpDT), TRUE),
  23.                                            i = tmpDT$i)) %>>% unique
  24.   }
  25.   return(uniqueDT[ , .(combinedV = str_c(V, collapse = ",")), by = .(i)]$combinedV)
  26. }
  27. # 測一下生成時間
  28. system.time(getCombinedFunc(namePlayers, numGames, 5)) # 1.64 seconds
  29.  
  30. # 生成目標資料表
  31. DT <- data.table(attack = getCombinedFunc(namePlayers, numGames, 5),
  32.                  defence = getCombinedFunc(namePlayers, numGames, 5))
  33.  
  34. # 修改自andrew大的方法
  35. andrew <- function(data, name.player) {
  36.   out.attack <- strsplit(data[[1]], ",") %>>%
  37.     sapply(function(x) name.player %in% x) %>>% t %>>%
  38.     `colnames<-`(str_c("attack_", name.player)) %>>%
  39.     mapvalues(c(TRUE, FALSE), c(1L, 0L), FALSE)
  40.   out.defence <- strsplit(data[[2]], ",") %>>%
  41.     sapply(function(x) name.player %in% x) %>>% t %>>%
  42.     `colnames<-`(str_c("defense_", name.player)) %>>%
  43.     mapvalues(c(TRUE, FALSE), c(-1L, 0L), FALSE)
  44.   cbind(out.attack, out.defence)
  45. }
  46.  
  47. # 修改自wush大的方法
  48. wush <- function(data, name.player) {
  49.   it <- itoken(data[[1]], tokenizer = word_tokenizer, progressbar = FALSE, n_chunks = 10)
  50.   it2 <- itoken(data[[2]], tokenizer = word_tokenizer, progressbar = FALSE, n_chunks = 10)
  51.   vocab <- create_vocabulary(name.player)
  52.   vectorizer <- vocab_vectorizer(vocab)
  53.   m1 <- create_dtm(it, vectorizer)
  54.   colnames(m1) <- str_c("attack_", colnames(m1))
  55.   m2 <- create_dtm(it2, vectorizer)
  56.   colnames(m2) <- str_c("defense_", colnames(m2))
  57.   m2@x[] <- -1
  58.   cbind(m1, m2)
  59. }
  60.  
  61. # 我的方法
  62. getLocMatFunc <- function(x, table, value = 1, colnames = NULL) {
  63.   tmp <- str_split(x, ",")
  64.   # 找出column位置
  65.   j <- fmatch(do.call(c, tmp), table)
  66.   # 找出row位置
  67.   i <- do.call(c, mapply(function(i, x) rep(i, length(x)), seq_along(tmp), tmp, SIMPLIFY = FALSE))
  68.   # 產生出sparse matrix
  69.   sparseMatrix(i, j, x = value, dims = c(length(x), length(table)), dimnames = list(NULL, colnames))
  70. }
  71.  
  72. getMatrixFunc <- function(DT, namePlayers) {
  73.   cbind(getLocMatFunc(DT$attack, namePlayers, 1., str_c("attack_", namePlayers)),
  74.         getLocMatFunc(DT$defence, namePlayers, -1., str_c("defense_", namePlayers)))
  75. }
  76.  
  77. # check結果
  78. Andrew <- andrew(DT, namePlayers)
  79. Wush <- wush(DT, namePlayers)
  80. rownames(Wush) <- NULL
  81. MyMethod <- getMatrixFunc(DT, namePlayers)
  82.  
  83. all.equal(Wush, Matrix(Andrew, sparse = TRUE)) # TRUE
  84. all.equal(MyMethod, Wush) # TRUE
  85. all.equal(MyMethod, Matrix(Andrew, sparse = TRUE)) # TRUE
  86.  
  87. # 使用microbenchmark
  88. library(microbenchmark)
  89. microbenchmark(
  90.   Andrew = andrew(DT, namePlayers),
  91.   Wush = wush(DT, namePlayers),
  92.   MyMethod = getMatrixFunc(DT, namePlayers),
  93.   times = 10L
  94. )
  95. # Unit: seconds
  96. #      expr       min        lq      mean    median        uq       max neval
  97. #    Andrew 25.564674 25.631636 26.357786 26.429542 26.804092 27.312797    10
  98. #      Wush  8.051787  8.127275  8.327858  8.319552  8.556822  8.621760    10
  99. #  MyMethod  1.978885  2.033370  2.240003  2.145650  2.334539  2.959432    10
Advertisement
Add Comment
Please, Sign In to add comment