Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Extract a numbered English/French word list from a PDF into a CSV.
#
# Reads a fixed page range of the PDF, splits the page text into lines,
# parses every line into an entry number plus an English and a French
# column, labels each entry with the section it falls under, and writes the
# resulting table to disk.
library(tidyverse)

# Inputs and outputs ----
pdf_path <- "silewp2006_005.pdf"
page_range <- 6:49
out_path <- "result.csv"

# Return the n-th piece of every split line; NA when a line has fewer pieces.
nth_piece <- function(pieces, n) {
  vapply(pieces, function(p) p[n], character(1))
}

# Read the PDF ----
pages <- pdftools::pdf_text(pdf = pdf_path)
# str_split() is vectorised over pages, so no per-page loop is needed.
page_lines <- unlist(str_split(pages[page_range], "\n"))

# Parse the lines ----
# NOTE(review): the replacement and split delimiters below are single spaces,
# exactly as they appear in the source this script was recovered from. With a
# single-space delimiter only the first word after the number ends up in
# `english` and only the second in `french` -- confirm that the original
# delimiters were not wider (e.g. several spaces or a tab).
df <- tibble(text = page_lines) %>%
  mutate(
    # First run of four digits on the line, if any.
    word_id = str_extract(text, "\\d{4}"),
    # Make sure every occurrence of that run is followed by a space.
    text = if_else(
      is.na(word_id),
      text,
      str_replace_all(text, word_id, paste0(word_id, " "))
    ),
    # Collapse runs of three or more whitespace characters.
    text = str_replace_all(text, "\\s{3,}", " ")
  ) %>%
  # Drop lines that start with a space (and NA lines, which filter() discards).
  filter(!str_detect(text, "^ ")) %>%
  mutate(
    # Split each line once instead of re-splitting it for every column.
    pieces = str_split(text, " "),
    number = nth_piece(pieces, 1L),
    english = nth_piece(pieces, 2L),
    french = nth_piece(pieces, 3L),
    # A dot in the number marks a section heading rather than a word entry.
    chapter = if_else(str_detect(number, "\\."), "section", "word")
  ) %>%
  filter(number != "") %>%
  # Section headings carry their own number; fill it down onto the entries
  # that follow, then up so that rows before the first heading receive the
  # first section. When no heading exists at all the column stays NA.
  mutate(section = if_else(chapter == "section", number, NA_character_)) %>%
  fill(section, .direction = "downup") %>%
  select(number, english, french, chapter, section)

# Write the result ----
write_csv(df, out_path)
Add Comment
Please, Sign In to add comment