Guest User

Untitled

a guest
May 23rd, 2018
110
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.72 KB | None | 0 0
  1. # The MIT License (MIT)
  2. #
  3. # Copyright (c) 2012 Schaun Jacob Wheeler
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in all
  13. # copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. # SOFTWARE.
  22.  
  23. # I've modified this for my purposes. - DM
  24.  
  25.  
  26. library(XML)
  27. library(plyr)
  28. library(pbapply)
  29.  
  30. xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) {
  31.  
  32. temp_dir <- file.path(tempdir(), "xlsxToRtemp")
  33. suppressWarnings(dir.create(temp_dir))
  34.  
  35. file.copy(file, temp_dir)
  36. new_file <- list.files(temp_dir, full.name = TRUE, pattern = basename(file))
  37. unzip(new_file, exdir = temp_dir)
  38.  
  39. # Get names of sheets
  40. sheet_names <- xmlToList(xmlParse(list.files(
  41. paste0(temp_dir, "/xl"), full.name = TRUE, pattern = "workbook.xml")))
  42. sheet_names <- rbind.fill(lapply(sheet_names$sheets, function(x) {
  43. as.data.frame(as.list(x), stringsAsFactors = FALSE)
  44. }))
  45. rownames(sheet_names) <- NULL
  46. sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE)
  47. sheet_names$id <- gsub("\\D", "", sheet_names$id)
  48.  
  49. # Get column classes
  50. styles <- xmlParse(list.files(
  51. paste0(temp_dir, "/xl"), full.name = TRUE, pattern = "styles.xml"))
  52. styles <- xpathApply(styles, "//x:xf[@applyNumberFormat and @numFmtId]",
  53. namespaces = "x", xmlAttrs)
  54. styles <- lapply(styles, function(x) {
  55. x[grepl("applyNumberFormat|numFmtId", names(x))]})
  56. styles <- do.call("rbind", (lapply(styles,
  57. function(x) as.data.frame(as.list(x[c("applyNumberFormat", "numFmtId")]),
  58. stringsAsFactors = FALSE))))
  59.  
  60. if(!is.null(keep_sheets)) {
  61. sheet_names <- sheet_names[sheet_names$name %in% keep_sheets,]
  62.  
  63. }
  64.  
  65. worksheet_paths <- list.files(
  66. paste0(temp_dir, "/xl/worksheets"),
  67. full.name = TRUE,
  68. pattern = paste0(
  69. "sheet(",
  70. paste(sheet_names$id, collapse = "|"),
  71. ")\\.xml$"))
  72.  
  73. worksheets <- lapply(worksheet_paths, function(x) xmlRoot(xmlParse(x))[["sheetData"]])
  74.  
  75. worksheets <- pblapply(seq_along(worksheets), function(i) {
  76.  
  77. x <- xpathApply(worksheets[[i]], "//x:c", namespaces = "x", function(node) {
  78. c("v" = xmlValue(node[["v"]]), xmlAttrs(node))
  79. })
  80.  
  81. if(length(x) > 0) {
  82.  
  83. x_rows <- unlist(lapply(seq_along(x), function(i) rep(i, length(x[[i]]))))
  84. x <- unlist(x)
  85.  
  86. x <- reshape(
  87. data.frame(
  88. "row" = x_rows,
  89. "ind" = names(x),
  90. "value" = x,
  91. stringsAsFactors = FALSE),
  92. idvar = "row", timevar = "ind", direction = "wide")
  93.  
  94. x$sheet <- sheet_names[sheet_names$id == i, "name"]
  95. colnames(x) <- gsub("^value\\.", "", colnames(x))
  96. }
  97. x
  98. })
  99. worksheets <- do.call("rbind.fill",
  100. worksheets[sapply(worksheets, class) == "data.frame"])
  101.  
  102. entries <- xmlParse(list.files(paste0(temp_dir, "/xl"), full.name = TRUE,
  103. pattern = "sharedStrings.xml$"))
  104. entries <- xpathSApply(entries, "//x:si", namespaces = "x", xmlValue)
  105. names(entries) <- seq_along(entries) - 1
  106.  
  107. entries_match <- entries[
  108. match(worksheets$v[worksheets$t == "s" & !is.na(worksheets$t)],
  109. names(entries))]
  110. worksheets$v[worksheets$t == "s" & !is.na(worksheets$t)] <- entries_match
  111. worksheets$cols <- match(gsub("\\d", "", worksheets$r), LETTERS)
  112. worksheets$rows <- as.numeric(gsub("\\D", "", worksheets$r))
  113.  
  114. if(!any(grepl("^s$", colnames(worksheets)))) {
  115. worksheets$s <- NA
  116. }
  117.  
  118. workbook <- lapply(unique(worksheets$sheet), function(x) {
  119. y <- worksheets[worksheets$sheet == x,]
  120. y_style <- as.data.frame(tapply(y$s, list(y$rows, y$cols), identity),
  121. stringsAsFactors = FALSE)
  122. y <- as.data.frame(tapply(y$v, list(y$rows, y$cols), identity),
  123. stringsAsFactors = FALSE)
  124.  
  125. if(header) {
  126. colnames(y) <- y[1,]
  127. y <- y[-1,]
  128. y_style <- y_style[-1,]
  129. }
  130.  
  131. y_style <- sapply(y_style, function(x) {
  132. out <- names(which.max(table(x)))
  133. out[is.null(out)] <- NA
  134. out
  135. })
  136.  
  137. if(length(styles) > 0) {
  138. y_style <- styles$numFmtId[match(y_style, styles$applyNumberFormat)]
  139. }
  140.  
  141. y_style[y_style %in% 14:17] <- "date"
  142. y_style[y_style %in% c(18:21, 45:47)] <- "time"
  143. y_style[y_style %in% 22] <- "datetime"
  144. y_style[is.na(y_style) & !sapply(y, function(x)any(grepl("\\D", x)))] <- "numeric"
  145. y_style[is.na(y_style)] <- "character"
  146. y_style[!(y_style %in% c("date", "time", "datetime", "numeric"))] <- "character"
  147.  
  148. y[] <- lapply(seq_along(y), function(i) {
  149. switch(y_style[i],
  150. character = y[,i],
  151. numeric = as.numeric(y[,i]),
  152. date = as.Date(as.numeric(y[,i]), origin = os_origin),
  153. time = strftime(as.POSIXct(as.numeric(y[,i]), origin = os_origin), format = "%H:%M:%S"),
  154. datetime = as.POSIXct(as.numeric(y[,i]), origin = os_origin))
  155. })
  156. y
  157. })
  158.  
  159. if(length(workbook) == 1) {
  160. workbook <- workbook[[1]]
  161. } else {
  162. names(workbook) <- sheet_names$name
  163. }
  164.  
  165. workbook
  166. }
Add Comment
Please, Sign In to add comment