Guest User

Untitled

a guest
Dec 16th, 2018
87
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.94 KB | None | 0 0
  # Extract a numbered English/French word list from a PDF and write it to a
  # CSV with one row per parsed line, each labelled with its section number.
  library(tidyverse)

  # ---- Inputs and outputs ----
  pdf_path <- "silewp2006_005.pdf"
  word_list_pages <- 6:49 # pages of the PDF that are parsed
  output_path <- "result.csv"

  # pdf_text() yields one string per page; break the selected pages into lines.
  pdf_pages <- pdftools::pdf_text(pdf = pdf_path)
  raw_lines <- unlist(str_split(pdf_pages[word_list_pages], "\n"))

  # ---- Parse lines into number / english / french ----
  df <- tibble(text = raw_lines) %>%
    mutate(
      # First run of four digits on the line (NA when the line has none).
      entry_id = str_extract(text, "\\d{4}"),
      # Append a space to that run so it becomes its own token even when it is
      # glued to the text that follows it.
      text = if_else(
        is.na(entry_id),
        text,
        str_replace_all(text, entry_id, paste0(entry_id, " "))
      ),
      # Collapse runs of three or more whitespace characters to one space.
      text = str_replace_all(text, "\\s{3,}", " ")
    ) %>%
    select(-entry_id) %>%
    # Discard lines that still start with a space after the collapse above.
    filter(!str_detect(text, "^ ")) %>%
    mutate(
      # Split once per line instead of three times per row under rowwise().
      # Only the first three space-separated tokens are kept, so anything
      # after the third token on a line is dropped.
      tokens = str_split(text, " "),
      number = map_chr(tokens, 1L),
      english = map_chr(tokens, 2L, .default = NA_character_),
      french = map_chr(tokens, 3L, .default = NA_character_),
      # A "." in the first token marks the row as a section heading.
      chapter = if_else(str_detect(number, "\\."), "section", "word")
    ) %>%
    # Empty lines produce an empty first token; remove them.
    filter(number != "") %>%
    select(-text, -tokens) %>%
    # ---- Label every row with the section it falls under ----
    # Seed the label on heading rows, then carry it down to the rows below.
    # "downup" also gives rows that precede the first heading that heading's
    # number. If no heading exists the column is simply all NA.
    mutate(section = if_else(chapter == "section", number, NA_character_)) %>%
    fill(section, .direction = "downup")

  write_csv(df, output_path)
Add Comment
Please, Sign In to add comment