Advertisement
Guest User

Untitled

a guest
Aug 9th, 2016
131
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.49 KB | None | 0 0
  ## Run from the project directory; the relative paths used further down
  ## ("crypt.R", "json", ...) are resolved against it
  project_dir <- file.path("G:", "projects", "serve-interactive-maps-beta")
  setwd(project_dir)
  3.  
  4. ## Import packages
  5. library(data.table)
  6. library(digest)
  7. library(dplyr)
  8. library(httr)
  9. library(jsonlite)
  10. library(magrittr)
  11. library(parallel)
  12. library(rmongodb)
  13. library(stringr)
  14.  
  ## Session-wide settings: printed digits, console width, warning mode
  options(digits = 10, width = 512, warn = 0)

  ## Use the local package library, both through the environment variable
  ## and through the current session's library search path
  r_library <- "C:/R/library"
  Sys.setenv(R_LIBS_USER = r_library)
  .libPaths(r_library)

  ## Load the encrypted credentials.
  ## NOTE(review): `read.aes` is presumably defined by crypt.R and `key` is
  ## presumably restored from myrkey.RData -- confirm against those files.
  source("crypt.R")
  load("myrkey.RData")
  credentials <- read.aes(filename = "credentials.txt", key = key)
  24.  
  ## No-op annotation helper: the argument is never evaluated and the
  ## function returns NULL invisibly
  Comment <- function(`@Comments`) {
    invisible(NULL)
  }
  27.  
  if (FALSE) {
    ## Disabled: build the proxy settings from the decrypted credentials and
    ## register them through httr's set_config()
    proxy_settings <- use_proxy(
      url = credentials$url,
      port = credentials$port,
      username = credentials$username,
      password = credentials$password
    )
    set_config(proxy_settings)
  }
  36.  
  if (FALSE) {
    ## Disabled: one-off installation of `rmongodb` from GitHub
    library(devtools)
    rmongodb_repo <- "mongosoup/rmongodb"
    install_github(repo = rmongodb_repo)
  }
  42.  
  ## ---------------------------------------------------------------------
  ## Helpers for the MongoDB import
  ## ---------------------------------------------------------------------

  ## MD5-hash a single value. Defined at top level (not as an inline
  ## closure) so that only the function itself is shipped to the workers.
  ##   x    : one element of a data-frame column
  ##   seed : forwarded to digest::digest() unchanged
  ## Returns the hash as a character string.
  md5_hash <- function(x, seed) {
    digest::digest(object = x,
                   algo = "md5",
                   seed = seed,
                   errormode = "warn")
  }

  ## Hash every element of `values` on the cluster `cl`.
  ## Returns an unnamed character vector of the same length as `values`.
  ## parSapply() would name each hash with the raw input value when the
  ## input is character (USE.NAMES = TRUE), carrying the unhashed value
  ## along with the hash, so names are dropped explicitly here.
  hash_column <- function(cl, values, seed = 123) {
    hashed <- parLapply(cl, values, md5_hash, seed = seed)
    as.character(unlist(hashed, use.names = FALSE))
  }

  ## Turn a one-row data frame into a list of its non-NA fields.
  row_to_list <- function(row) {
    fields <- as.list(row)
    fields[!is.na(fields)]
  }

  ## Build the list of BSON documents for one parsed input file.
  ##   cl       : cluster used for hashing and row conversion
  ##   doc      : data frame as returned by jsonlite::fromJSON()
  ##   pid_seed : seed used when hashing `producerid`
  ## Returns a list with one mongo.bson object per row of `doc`.
  build_bson_docs <- function(cl, doc, pid_seed) {
    doc$md5_pid <- hash_column(cl, doc$producerid, seed = pid_seed)
    doc$md5_cid <- hash_column(cl, doc$companyid)
    doc$md5_cmp <- hash_column(cl, doc$company)

    ## Optional association columns. When one is present it is hashed,
    ## missing values stay NA, and the frame is reduced to the hashed and
    ## location columns only. The checks run in sequence, as before.
    optional <- c(houjinkai = "md5_hou", nouzeikyoukai = "md5_nou")
    for (src in names(optional)) {
      if (src %in% names(doc)) {
        dst <- optional[[src]]
        doc[[dst]] <- hash_column(cl, doc[[src]])
        doc[[dst]][is.na(doc[[src]])] <- NA
        doc <- doc[, c("md5_pid",
                       "md5_cid",
                       "md5_cmp",
                       dst,
                       "address",
                       "lat",
                       "long",
                       "depth")]
      }
    }

    ## One list per row, NA fields dropped (pure data, safe to parallelise)
    rows <- parLapply(cl, split(doc, seq_len(nrow(doc))), row_to_list)

    ## The BSON conversion runs in this process: the workers never load
    ## rmongodb, and mongo.bson objects are backed by an external pointer
    ## that does not survive being serialized back from another process.
    lapply(rows, mongo.bson.from.list)
  }

  ## Read each JSON file, hash its identifying columns and insert the
  ## result into MongoDB.
  ##   mongo    : open rmongodb connection
  ##   files    : character vector of input file paths
  ##   pid_seed : seed used when hashing `producerid`
  ##   ns       : target namespace ("database.collection")
  ##   workers  : number of cluster worker processes
  ## Returns the number of files given, invisibly.
  import_json_files <- function(mongo, files, pid_seed,
                                ns = "science.houjinkai", workers = 4) {
    if (length(files) == 0L) {
      return(invisible(0L))
    }

    ## One cluster for the whole run, released even if a file fails
    cl <- makeCluster(workers)
    on.exit(stopCluster(cl), add = TRUE)

    for (f in files) {
      ## Read from text file
      doc <- jsonlite::fromJSON(txt = f)
      if (NROW(doc) == 0L) {
        message("skipping empty input: ", f)
        next
      }

      ## Insert the BSON objects into mongodb
      mongo.insert.batch(mongo = mongo,
                         ns = ns,
                         lst = build_bson_docs(cl, doc, pid_seed))

      ## Print the processed filename
      print(f)
      flush.console()
    }
    invisible(length(files))
  }

  ## ---------------------------------------------------------------------
  ## Main
  ## ---------------------------------------------------------------------

  ## Connect to localhost
  mymongo <- mongo.create(db = "science")

  ## Check for working connection
  if (isTRUE(mongo.is.connected(mymongo))) {
    tryCatch({
      ## Get the list of input filenames. list.files() expects a regular
      ## expression, so the glob is translated instead of passed verbatim.
      files <- list.files(path = "json",
                          pattern = glob2rx("json_houjinkai_*"),
                          full.names = TRUE)

      ## Insert data into mongodb
      t1 <- Sys.time()
      import_json_files(mymongo, files, pid_seed = credentials$myseed)
      t2 <- Sys.time()
      td <- as.numeric(difftime(t2, t1, units = "secs"))
      msg <- paste0("[insert data into mongodb] took ", format(td), " secs")
      print(msg)
      flush.console()
      rm(files, t1, t2, td, msg)

      ## Disabled: read the collection back and time it
      if (FALSE) {
        if (isTRUE(mongo.is.connected(mymongo))) {
          # mongo.drop(mymongo, "science.houjinkai")

          # nr <- mongo.count(mymongo, ns="science.houjinkai")
          # print(nr)

          t1 <- Sys.time()
          res <- mongo.find.all(mymongo, "science.houjinkai") %>%
            lapply(., function(x) { as.data.table(x) }) %>%
            dplyr::bind_rows(.)

          Encoding(res$companyid) <- "UTF-8"
          Encoding(res$company) <- "UTF-8"
          Encoding(res$address) <- "UTF-8"
          Encoding(res$houjinkai) <- "UTF-8"
          Encoding(res$nouzeikyoukai) <- "UTF-8"

          t2 <- Sys.time()
          td <- as.numeric(difftime(t2, t1, units = "secs"))
          msg <- paste0("[get data from mongodb] took ", format(td), " secs")
          print(msg)
          flush.console()
        }
      }
    }, finally = {
      ## Disconnect from mongoDB, also when the import failed part-way
      mongo.destroy(mymongo)
    })
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement