Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library(data.table)
- library(pipeR)
- library(stringr)
- library(iterators)
- library(foreach)
- library(doParallel)
# ---- Simulate customer purchase data ----
# Fixed seed so every run draws the same customers and baskets.
set.seed(100)
numCustomer <- 1e4L
# Basket size per customer: an integer between 1 and 10, drawn with replacement.
numPurchase <- sample.int(10L, numCustomer, replace = TRUE)

# Draw one basket of `n` line items: product codes sampled without replacement
# from the lowercase letters, non-negative prices centred on 100 (sd 20), and
# quantities sampled without replacement from 1..100.
makeBasket <- function(n) {
  data.table(
    PN = sample(letters, n),
    price = abs(rnorm(n, mean = 100, sd = 20)),
    number = sample.int(100, n)
  )
}

# One row per customer; `items` is a list column holding that customer's
# basket. `readTime` is a timestamp somewhere within the last ten days.
out <- data.table(
  id = seq_len(numCustomer),
  payment = sample(LETTERS[1:3], numCustomer, replace = TRUE),
  readTime = Sys.time() - sample.int(86400 * 10, numCustomer, replace = TRUE),
  items = lapply(numPurchase, makeBasket)
)
# ---- Write the simulated transactions to a plain-text log ----
# Layout per customer: three header lines (customerid / payment / ReadTime)
# followed by one line per purchased item, each item line beginning with the
# customer's read time formatted as YYYYmmddHHMMSS.
st <- proc.time()

# Format everything in vectorised form instead of issuing one cat() per row.
timeStr <- format(out$readTime, "%Y%m%d%H%M%S")
headerText <- sprintf("customerid: %i\npayment: %s\nReadTime: %s\n",
                      out$id, out$payment, timeStr)
# One string per customer holding all of that customer's item lines.
itemText <- vapply(seq_along(timeStr), function(i) {
  basket <- out$items[[i]]
  paste0(sprintf("%sPruductNumber: %s: NT%010.4fNumber: %i\n",
                 timeStr[i], basket$PN, basket$price, basket$number),
         collapse = "")
}, character(1L))

# Opening with "w" truncates any existing file, so no prior unlink() is needed.
fileConn <- file("transRecord.txt", "w")
# sep = "" matters: every piece already ends in "\n", and cat()'s default
# separator (" ") would otherwise be written in front of every element after
# the first, leaving stray leading spaces in the file. `append` is not passed
# because cat() ignores it when `file` is a connection.
# invisible() keeps the NULL returned by cat() from being auto-printed.
invisible(tryCatch(
  cat(paste0(headerText, itemText), file = fileConn, sep = ""),
  finally = close(fileConn)  # release the connection even if the write fails
))
proc.time() - st
# For reference, the earlier row-by-row foreach writer was timed at:
# user system elapsed
# 39.50 0.14 40.11
file.size("transRecord.txt") / 2^20
# about 3.5 MB (3.508 MB was recorded when item lines still carried the stray
# separator spaces)
# ---- Parse the transaction log back into a table ----
# Each customer record (header lines plus item lines) is parsed as one foreach
# task on a local cluster; the per-record tables are row-bound into `resDT`.
# All parsed columns are character strings, exactly as captured by the regexes.
lines <- readLines("transRecord.txt")
basicInfoColNames <- c("customerid", "payment", "ReadTime")
transInfoColNames <- c("transTime", "PruductNumber", "price", "number")

# detectCores() can return NA and is 1 on a single-core machine; either would
# make `detectCores() - 1L` an invalid worker count, so fall back to 1 worker.
numWorkers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(numWorkers)
registerDoParallel(cl)

st <- proc.time()
resDT <- tryCatch({
  # A new record starts at every line beginning with "customerid".
  recordIter <- isplit(lines, cumsum(str_detect(lines, "^customerid")))
  parsed <- foreach(rec = recordIter, .combine = rbind, .multicombine = TRUE,
                    .packages = c("data.table", "stringr", "pipeR")) %dopar%
  {
    recLines <- rec$value
    # The ReadTime line is the last header line; item lines follow it.
    idx <- which(str_detect(recLines, "^ReadTime"))
    headerIdx <- seq_len(idx)
    part1 <- str_match(recLines[headerIdx], "(customerid|payment|ReadTime): (.*)")
    # Negative indexing yields an empty vector for a record without items,
    # where (idx + 1):length(recLines) would count backwards.
    itemLines <- recLines[-headerIdx]
    part2 <- if (length(itemLines) > 0L) {
      str_match(itemLines,
                "(\\d{14})PruductNumber: ([a-zA-Z0-9]+): NT(\\d{5}\\.\\d{4})Number: (\\d+)")
    } else {
      # Keep the customer as a single row with NA transaction fields.
      matrix(NA_character_, nrow = 1L, ncol = 5L)
    }
    # For each wanted column, look up the header row that carries it, so the
    # values line up with basicInfoColNames regardless of header line order.
    outDT <- data.table(t(part1[match(basicInfoColNames, part1[, 2]), 3, drop = FALSE])) %>>%
      setnames(basicInfoColNames)
    # The one-row header table is recycled across all item rows.
    cbind(outDT, data.table(part2[, 2:5, drop = FALSE]) %>>% setnames(transInfoColNames))
  }
  # Capture the timing before the cluster is shut down.
  elapsed <- proc.time() - st
  parsed
}, finally = stopCluster(cl))  # always release the workers, even on error
elapsed
# user system elapsed
# 9.33 3.14 16.86
head(resDT, 10)
# customerid payment ReadTime transTime PruductNumber price number
# 1: 1 B 20170410235836 20170410235836 g 00087.0582 84
# 2: 1 B 20170410235836 20170410235836 z 00109.8510 62
# 3: 1 B 20170410235836 20170410235836 a 00113.2259 13
# 4: 1 B 20170410235836 20170410235836 p 00124.6366 44
# 5: 2 A 20170416151637 20170416151637 a 00127.0563 49
# 6: 2 A 20170416151637 20170416151637 e 00102.5638 40
# 7: 2 A 20170416151637 20170416151637 y 00117.9419 3
# 8: 3 B 20170415064513 20170415064513 p 00053.0424 19
# 9: 3 B 20170415064513 20170415064513 h 00059.5930 31
# 10: 3 B 20170415064513 20170415064513 s 00087.3829 50
Advertisement
Add Comment
Please, Sign In to add comment