SHARE
TWEET

Untitled

a guest Dec 16th, 2018 60 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  # Extract a numbered bilingual word list from a PDF into a tidy CSV.
  #
  # Reads the text of the selected pages of `pdf_path`, splits it into lines,
  # keeps the lines that do not start with a space, and splits each of them
  # into `number`, `english` and `french`. Rows whose number contains a dot
  # are tagged "section", all others "word"; every row is then labelled with
  # the number of the section heading it belongs to. The result is written
  # to `csv_path`.
  library(tidyverse)

  pdf_path <- "silewp2006_005.pdf"
  page_range <- 6:49
  csv_path <- "result.csv"

  # One character string per PDF page.
  pages <- pdftools::pdf_text(pdf = pdf_path)

  # str_split() is vectorised over pages; flatten the per-page pieces into a
  # single character vector holding one element per text line.
  text_lines <- unlist(str_split(pages[page_range], "\n"))

  df <- tibble(text = text_lines) %>%
    mutate(
      # First four-digit run on the line (NA when there is none).
      id = str_extract(text, "\\d{4}"),
      # Put a space after that number so it separates from the next token.
      text = if_else(
        is.na(id),
        text,
        str_replace_all(text, id, paste0(id, " "))
      ),
      # Collapse wide column gaps to exactly two spaces, the field delimiter
      # used below.
      text = str_replace_all(text, "\\s{3,}", "  ")
    ) %>%
    select(-id) %>%
    # Drop lines that begin with a space.
    filter(!str_detect(text, "^ ")) %>%
    mutate(
      # Everything before the first single space.
      number = map_chr(str_split(text, " "), 1L),
      # Fields separated by the two-space delimiter; a missing field is NA.
      fields = str_split(text, "  "),
      english = map_chr(fields, 2L, .default = NA_character_),
      french = map_chr(fields, 3L, .default = NA_character_),
      chapter = if_else(str_detect(number, "\\."), "section", "word")
    ) %>%
    filter(number != "") %>%
    select(-text, -fields) %>%
    # Label each row with its section heading's number: seed the column on
    # heading rows and carry it down. "downup" additionally assigns rows that
    # precede the first heading to that heading; when there is no heading at
    # all the column is simply NA instead of raising an error.
    mutate(section = if_else(chapter == "section", number, NA_character_)) %>%
    fill(section, .direction = "downup")

  write_csv(df, csv_path)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top