celestialgod

parse transaction details

Apr 19th, 2017
316
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 3.37 KB | None | 0 0
  1. library(data.table)
  2. library(pipeR)
  3. library(stringr)
  4. library(iterators)
  5. library(foreach)
  6. library(doParallel)
  7.  
  # ---- Simulate transaction data ----
  # Fixed seed so the simulated customers are identical on every run.
  set.seed(100)
  numCustomer <- 10000L
  # Number of purchased items per customer, drawn from 1..10 with replacement.
  numPurchase <- sample.int(n = 10L, size = numCustomer, replace = TRUE)

  # The random draws are made in column order (payment, readTime, items) so
  # the seeded random stream is consumed in a fixed, reproducible sequence.
  paymentType <- sample(LETTERS[1:3], size = numCustomer, replace = TRUE)
  # Each record is time-stamped up to ten days (in seconds) before "now".
  secondsAgo <- sample.int(n = 86400 * 10, size = numCustomer, replace = TRUE)
  readTime <- Sys.time() - secondsAgo

  # Build one customer's basket: product code, unit price and quantity.
  makeBasket <- function(nItems) {
    productCode <- sample(letters, size = nItems)
    unitPrice <- abs(rnorm(n = nItems, mean = 100, sd = 20))
    quantity <- sample.int(n = 100, size = nItems)
    data.table(PN = productCode, price = unitPrice, number = quantity)
  }

  # One row per customer; `items` is a list column holding each basket.
  out <- data.table(
    id = seq_len(numCustomer),
    payment = paymentType,
    readTime = readTime,
    items = lapply(numPurchase, makeBasket)
  )
  19.  
  # ---- Write the simulated records to a text file ----
  # Each customer becomes a three-line header (customerid / payment / ReadTime)
  # followed by one line per purchased item.
  if (file.exists("transRecord.txt")) {
    unlink("transRecord.txt")
  }

  st <- proc.time()
  # Format every timestamp once instead of once per row.
  timeStr <- format(out$readTime, "%Y%m%d%H%M%S")
  # The embedded "\n" keeps the three header fields on separate lines;
  # writeLines() supplies the final newline of each element.
  headerText <- sprintf("customerid: %i\npayment: %s\nReadTime: %s",
                        out$id, out$payment, timeStr)
  recordText <- lapply(seq_len(nrow(out)), function(i) {
    basket <- out$items[[i]]
    # "%010.4f" zero-pads the price to 5 integer digits + "." + 4 decimals,
    # which is the fixed-width layout the parsing regex expects. The
    # "PruductNumber" spelling is part of the file format and must match the
    # parser, so it is kept as is.
    c(headerText[i],
      sprintf("%sPruductNumber: %s: NT%010.4fNumber: %i",
              timeStr[i], basket$PN, basket$price, basket$number))
  })

  # Write everything in one call. The previous cat() calls used the default
  # sep = " ", which put a stray leading space on every item line after the
  # first one of each customer; writeLines() emits one clean line per element.
  fileConn <- file("transRecord.txt", "w")
  tryCatch(
    writeLines(unlist(recordText, use.names = FALSE), fileConn),
    finally = close(fileConn) # always release the connection, even on error
  )
  proc.time() - st
  # For reference, the earlier row-by-row foreach/cat version was reported as:
  #  user  system elapsed
  # 39.50    0.14   40.11
  file.size("transRecord.txt") / 2^20 # size in MB (roughly 3.5 MB here)
  41.  
  # ---- Parse the text file back into one table ----
  # Use all but one core, but never fewer than one worker: detectCores() can
  # return 1 (or NA), and makeCluster() rejects a worker count below 1.
  nWorkers <- max(1L, detectCores() - 1L, na.rm = TRUE)
  cl <- makeCluster(nWorkers)
  registerDoParallel(cl)

  lines <- readLines("transRecord.txt")
  basicInfoColNames <- c("customerid", "payment", "ReadTime")
  transInfoColNames <- c("transTime", "PruductNumber", "price", "number")

  st <- proc.time()
  # A new record starts at every "customerid" line, so the running count of
  # those lines labels each line with the record it belongs to.
  recordId <- cumsum(str_detect(lines, "^customerid"))
  recordIter <- isplit(lines, recordId)
  resDT <- foreach(rec = recordIter, .combine = rbind, .multicombine = TRUE,
                   .packages = c("data.table", "stringr")) %dopar% {
    recLines <- rec$value
    # The header ends at the "ReadTime" line; item lines follow it.
    headerEnd <- which(str_detect(recLines, "^ReadTime"))
    headerMatch <- str_match(recLines[1:headerEnd],
                             "(customerid|payment|ReadTime): (.*)")
    itemMatch <- str_match(recLines[(headerEnd + 1):length(recLines)],
                           "(\\d{14})PruductNumber: ([a-zA-Z0-9]+): NT(\\d{5}\\.\\d{4})Number: (\\d+)")

    # Pick the header values in the order of basicInfoColNames. The lookup
    # is match(wanted names, found keys); the reversed argument order only
    # worked because the file already lists the keys in that order.
    keyOrder <- match(basicInfoColNames, headerMatch[, 2])
    headerDT <- data.table(t(headerMatch[keyOrder, 3, drop = FALSE]))
    setnames(headerDT, basicInfoColNames)

    # Columns 2:5 of the match matrix are the four capture groups.
    itemDT <- data.table(itemMatch[, 2:5, drop = FALSE])
    setnames(itemDT, transInfoColNames)

    # The single header row is recycled across all item rows of the record.
    cbind(headerDT, itemDT)
  }
  proc.time() - st
  # Timing reported by the original author:
  # user  system elapsed
  # 9.33    3.14   16.86

  stopCluster(cl)

  # Preview the result. Every column is character because the values come
  # straight from the regex captures (hence the zero-padded prices).
  head(resDT, 10)
  #     customerid payment       ReadTime      transTime PruductNumber      price number
  #  1:          1       B 20170410235836 20170410235836             g 00087.0582     84
  #  2:          1       B 20170410235836 20170410235836             z 00109.8510     62
  #  3:          1       B 20170410235836 20170410235836             a 00113.2259     13
  #  4:          1       B 20170410235836 20170410235836             p 00124.6366     44
  #  5:          2       A 20170416151637 20170416151637             a 00127.0563     49
  #  6:          2       A 20170416151637 20170416151637             e 00102.5638     40
  #  7:          2       A 20170416151637 20170416151637             y 00117.9419      3
  #  8:          3       B 20170415064513 20170415064513             p 00053.0424     19
  #  9:          3       B 20170415064513 20170415064513             h 00059.5930     31
  # 10:          3       B 20170415064513 20170415064513             s 00087.3829     50
Advertisement
Add Comment
Please, Sign In to add comment