Advertisement
Guest User

Untitled

a guest
May 26th, 2016
275
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.75 KB | None | 0 0
  1. install.packages("dplyr")
  2. install.packages("tm")
  3. install.packages("tidyr")
  4. install.packages("stringr")
  5. require(tm)
  6. require(dplyr)
  7. require(tidyr)
  8. require(stringr)
  9. # download pdftotext (part of the xpdf tools) from
  10. # ftp://ftp.foolabs.com/pub/xpdf/xpdfbin-win-3.03.zip
  11. # exe <- "C:/Program Files (x86)/xpdfbin-win-3.04/bin64/pdftotext.exe"
  12. ## I converted the .pdf to a .txt and then made a separate textfile with just the cypher/artifact chunk
  13. # pdf <- "C:/Users/NS/Documents/Numenera/Numenera-Ninth_World_Guidebook.pdf"
  14. # exe <- "C:/Program Files (x86)/xpdfbin-win-3.04/bin64/pdftotext.exe"
  15. # system(paste("\"", exe, "\" \"", pdf, "\"", sep = ""), wait = F)
  16.  
  ## Load the hand-trimmed text file that holds only the cypher listings
  ## (the converted .txt was cut down in a text editor beforehand).
  ## Each element of the resulting character vector is one line of that file.
  cyphers.arthour <- readLines("C:/Users/NS/Documents/Numenera/cyphers.txt")
  20.  
  ## Drop every line that carries no cypher information. A line is kept only
  ## when ALL of the following hold:
  ##   * it is not empty
  ##   * it does not contain one of the banner / search-term strings
  ##   * it does not parse as a bare number (as.numeric() yields NA for it)
  ## as.numeric() emits a "NAs introduced by coercion" warning here; that is
  ## expected, since most lines are text.
  cyphers.arthour <- cyphers.arthour[
    cyphers.arthour != "" &
      !grepl("TECHNOLOGY COMPENDIUM", cyphers.arthour) &
      !grepl("SIR ARTHOUR\'S GUIDE TO THE NUMENERA CYPHERS", cyphers.arthour) &
      !grepl("Search terms:", cyphers.arthour) &
      is.na(as.numeric(cyphers.arthour))
  ]
  27.  
  ## Start a data frame with one row per remaining text line (column
  ## `cyphers.arthour`) and prime an `id` column; every cypher is assigned an
  ## id further down.
  ## FALSE is spelled out because `F` is an ordinary variable that can be
  ## rebound. The id column is sized from the frame itself so that an empty
  ## input produces an empty column instead of a length-mismatch error.
  cyphers.raw <- data.frame(cyphers.arthour, stringsAsFactors = FALSE)
  cyphers.raw$id <- rep(NA_integer_, nrow(cyphers.raw))
  31.  
  ## Every line containing "Level:" gets an id equal to its own row number,
  ## because every cypher has such a line and it is easy to identify. The line
  ## directly above it should be the cypher name, so it receives the same id.
  ##
  ## Vectorised instead of looping over 1:nrow(): the loop form iterates over
  ## c(1, 0) when the frame is empty and then fails inside `if`.
  level.rows <- grep("Level:", cyphers.raw$cyphers.arthour)
  cyphers.raw$id[level.rows] <- level.rows

  ## A "Level:" line in row 1 has no name line above it; excluding it keeps the
  ## index and value vectors the same length (a 0 index would be dropped).
  ## This assignment runs second so that, when two "Level:" lines are adjacent,
  ## the upper one ends up with the lower one's id, exactly as the row-by-row
  ## loop did.
  name.owners <- level.rows[level.rows > 1]
  cyphers.raw$id[name.owners - 1] <- name.owners
  41.  
  ## Every line that still has no id inherits the id of the closest preceding
  ## line that has one: such lines probably belong to that cypher (tables of
  ## detonation effects and the like) and need some kind of home regardless.
  ## Lines that come before the first id-bearing line stay NA.
  ##
  ## This is a plain "fill down". The previous row-by-row loop rescanned the
  ## whole id column for every NA row (quadratic) and selected candidates by
  ## comparing id *values* against the row index, which only works because ids
  ## are row numbers; it also iterated over 1:nrow(), which misbehaves on an
  ## empty frame.
  cyphers.raw <- tidyr::fill(cyphers.raw, id, .direction = "down")
  51.  
  ## Reshape to one row per cypher id. The k-th text line of each id group goes
  ## into column `line.k`; groups shorter than k lines get NA there (nth()
  ## returns its default when the position does not exist). Only the first 20
  ## lines of a group are captured.
  cyphers.tidier <- summarise(
    group_by(cyphers.raw, id),
    line.1  = nth(cyphers.arthour, 1),
    line.2  = nth(cyphers.arthour, 2),
    line.3  = nth(cyphers.arthour, 3),
    line.4  = nth(cyphers.arthour, 4),
    line.5  = nth(cyphers.arthour, 5),
    line.6  = nth(cyphers.arthour, 6),
    line.7  = nth(cyphers.arthour, 7),
    line.8  = nth(cyphers.arthour, 8),
    line.9  = nth(cyphers.arthour, 9),
    line.10 = nth(cyphers.arthour, 10),
    line.11 = nth(cyphers.arthour, 11),
    line.12 = nth(cyphers.arthour, 12),
    line.13 = nth(cyphers.arthour, 13),
    line.14 = nth(cyphers.arthour, 14),
    line.15 = nth(cyphers.arthour, 15),
    line.16 = nth(cyphers.arthour, 16),
    line.17 = nth(cyphers.arthour, 17),
    line.18 = nth(cyphers.arthour, 18),
    line.19 = nth(cyphers.arthour, 19),
    line.20 = nth(cyphers.arthour, 20)
  )
  77.  
  ## Prime the quality columns with NA so that they exist even when no cypher
  ## happens to list that quality; the final column selection refers to all
  ## four of them by name.
  cyphers.tidier <- mutate(
    cyphers.tidier,
    wearable = NA,
    usable = NA,
    internal = NA,
    effect = NA
  )
  83.  
  ## Parse each cypher's stat line (line.2, normally "Level: ... Effect: ...")
  ## into columns named after the header words found in it.
  ##
  ## A "header" is any space-separated word containing ":". The text between
  ## one header and the next (or the end of the line) becomes the value of the
  ## column named after that header, lower-cased with the colons removed.
  ## Expect some odd false-positive columns: the .pdf was not formatted for
  ## this use, so any word with a colon in it counts as a header.

  ## Join words[from..to] with single spaces. Returns "" for an empty span so
  ## that `from:to` never counts backwards (which would paste the neighbouring
  ## header words, or "NA", into the value).
  collapse.words <- function(words, from, to) {
    if (from > to) {
      return("")
    }
    paste(words[from:to], collapse = " ")
  }

  ## Create the level column up front instead of growing it via `$level[n]`.
  if (!("level" %in% names(cyphers.tidier))) {
    cyphers.tidier$level <- rep(NA_character_, nrow(cyphers.tidier))
  }

  for (n in seq_len(nrow(cyphers.tidier))) {
    ## One string per word of the stat line
    contents <- str_split(cyphers.tidier$line.2[n], " ")[[1]]
    ## Positions of things like "Level:" and "Effect:" among the words
    header.locs <- grep(":", contents)
    ## Nothing to parse: line.2 is missing (NA) or contains no header at all
    if (length(header.locs) == 0) {
      next
    }
    ## Column name for each header
    header.names <- contents[header.locs] %>%
      str_replace_all(":", "") %>%
      str_to_lower()
    ## Last word belonging to each header: the word before the next header,
    ## or the end of the line for the final header
    span.ends <- c(header.locs[-1] - 1, length(contents))

    ## Level is always listed first, so its value starts at the second word
    ## and runs up to the next header (or to the end of the line when
    ## "Level:" is the only header present).
    cyphers.tidier$level[n] <- collapse.words(contents, 2, span.ends[1])

    ## Remaining headers each go into the column bearing their name
    for (x in seq_along(header.locs)[-1]) {
      quality <- header.names[x]
      ## A word made only of colons gives an empty name; not a usable column
      if (!nzchar(quality)) {
        next
      }
      if (!(quality %in% names(cyphers.tidier))) {
        cyphers.tidier[[quality]] <- rep(NA_character_, nrow(cyphers.tidier))
      }
      cyphers.tidier[n, quality] <- collapse.words(
        contents, header.locs[x] + 1, span.ends[x]
      )
    }
  }
  107.  
  ## Lines after the "Level:" line sometimes hold relevant information too.
  ## Combine line.3 .. line.20 of every cypher into one string in `additional`,
  ## skipping the NA placeholders of cyphers that have fewer lines.
  ##
  ## Missing lines are dropped *before* pasting. Pasting first and deleting the
  ## text "NA" afterwards also deleted those two letters from inside real
  ## words (e.g. "DNA"). Columns are addressed by name rather than by position
  ## so the result does not depend on column order.
  extra.cols <- paste0("line.", 3:20)
  cyphers.tidier$additional <- vapply(
    seq_len(nrow(cyphers.tidier)),
    function(n) {
      extra.lines <- as.character(
        unlist(cyphers.tidier[n, extra.cols], use.names = FALSE)
      )
      str_trim(paste(extra.lines[!is.na(extra.lines)], collapse = " "))
    },
    character(1)
  )
  113.  
  ## The first line of every group is the cypher's name; expose it as `name`.
  cyphers.tidier <- mutate(cyphers.tidier, name = line.1)

  ## Keep only the parsed fields, in presentation order.
  cyphers <- select(
    cyphers.tidier,
    name, level, usable, wearable, internal, effect, additional
  )

  ## Export the (still uncorrected) table.
  write.csv(
    cyphers,
    file = "C:/Users/NS/Documents/Numenera/cypher_table_uncorrected.csv"
  )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement