require(tidyverse)   # Modern R
require(readxl)      # Read Excel files
require(pdftools)    # Read PDF files
require(quanteda)    # Analyze text

#
# BUILD A CORPUS WITH THE FULL TEXTS

# Ingest page-by-page
pdfs <- dir("papers_citavi", pattern="pdf$")
big <- list()
for (file in pdfs) {
  print(file)
  path <- paste("papers_citavi", file, sep="/")
  tryCatch(
    expr = {
      content <- pdf_text(path) %>%
        paste(collapse=" ") %>%
        str_replace_all("\\s+", " ")
      big[[file]] <- content
    },
    warning = function(w) NULL,
    error = function(e) NULL
  )
}
rm(file, path, content, pdfs)
big <- data.frame(big) %>%
  stack() %>%
  rename(text=values, file=ind)

# Save to disk
save(big, file="big.Rda")

#
# PREPARE THE TEXT ANALYSIS

sanitize <- function(item){
  item <- gsub(" [A-Z]\\.", "", item)                          # Abbreviated species
  item <- gsub("degrees C|° *C", "degrees_C ", item)           # Temperatures
  item <- gsub("-", "_", item)                                 # Hyphens
  item <- gsub("\\d+", "", item)                               # Numeric values
  item <- gsub("\\([^)]*\\)", "", item)                        # Parentheses
  item <- gsub("\\[[^)]*\\]", "", item)                        # Brackets
  item <- gsub("[[:punct:]]", "", item)                        # Symbols
  item <- gsub("'", "", item)                                  # Apostrophes
  item <- gsub(" references.*$", " ", item, ignore.case=TRUE)  # References
  item <- gsub(" \\S ", " ", item, ignore.case=TRUE)           # Single characters
  item <- gsub(" \\w{2} ", " ", item, ignore.case=TRUE)        # Very short words
  item <- gsub(" al ", " ", item, ignore.case=TRUE)            # 'al' from 'et al.'
  return(item)
}

expressions <- phrase(c("sex ratio",
                        "life cycle",
                        "life history",
                        "*ism rate",
                        "body size",
                        "host size",
                        "host quality",
                        "host density",
                        "natural enem*",
                        "biological control",
                        "mass rearing",
                        "foundress number",
                        "brood size"
                        ))

topics <- dictionary(list(
  sexratio    = c("sex_ratio", "*male_biased"),
  temperature = c("degre*", "°*", "temper*", "warm*", "heat*"),
  humidity    = c("humid*", "damp", "hygro*"),
  pressure    = c("hPa", "bar", "atmos*"),
  diet        = c("diet*", "nutri*", "food", "forag*"),
  structure   = c("patch*", "habit*", "distan*", "locat*", "wind"),
  hosts       = c("host_quality", "host_size"),
  dynamics    = c("LMC", "compet*", "*ism_rate*", "life_histor*", "foundress*")
))

#
# ANALYZE THE CORPUS OF ABSTRACTS

smol <- read_excel('abstracts.xlsx') %>% as_tibble()
smol$Abstract <- sapply(smol$Abstract, sanitize)
smol <- as.data.frame(smol)

corp_abs <- corpus(smol$Abstract, docnames=smol$ID)
toks_abs <- tokens(corp_abs, remove_punct=FALSE)
toks_abs <- tokens_compound(toks_abs, expressions)
freq_abs <- dfm(toks_abs,
                tolower=TRUE,
                stem=FALSE,
                remove_punct=TRUE,
                remove=stopwords("english")
                )
counts_abs <- dfm_lookup(freq_abs, dictionary=topics)
counts_abs <- counts_abs %>% convert(to="data.frame") %>% as_tibble()
write.table(counts_abs, file="counts_abs.csv", row.names=FALSE, na="", sep=";")

#
# ANALYZE THE CORPUS OF FULL TEXTS

# Load from disk
load("big.Rda")
big <- big %>% as_tibble() %>% as.data.frame()

# Replace problematic parts
big$text <- sapply(big$text, sanitize)

# Create a corpus
corp_full <- corpus(big$text, docnames=big$file)

# Tokenize the text
toks_full <- tokens(corp_full)

# Compound expressions
toks_full <- tokens_compound(toks_full, expressions)

# Create a frequency matrix
freq_full <- dfm(toks_full,
                 tolower=TRUE,
                 stem=FALSE,
                 remove_punct=TRUE,
                 remove=stopwords("english"))

# Remove short words and ones that occur rarely
freq_full <- dfm_trim(freq_full, min_nchar=3)
# freq_full <- dfm_trim(freq_full, min_termfreq=13)

# Combine plurals and singulars
# ...
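
# Optional sanity check (a sketch, not part of the original pipeline):
# list a few features containing an underscore to confirm that the
# multi-word expressions above were compounded as expected, e.g. "sex_ratio".
# head(featnames(freq_full)[grepl("_", featnames(freq_full))], 20)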

# Check the most and least frequent words
# topfeatures(freq_full)
# tail(freq_full)

# Assign weights to tokens
# freq_full_weighted <- dfm_tfidf(freq_full) # Weight the features using tf-idf

# Assign topics
counts_full <- dfm_lookup(freq_full, dictionary=topics)
counts_full <- counts_full %>% convert(to="data.frame") %>% as_tibble()
counts_full$doc_id <- factor(counts_full$doc_id)

# Export
write.table(counts_full, file="counts_full.csv", row.names=FALSE, na="", sep=";")

#
# REFERENCES

# Cite used packages
citation()
packages <- c('tidyverse', 'readxl', 'pdftools', 'quanteda')
for (p in packages) {
  print(citation(p))
}
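
# Optional (base R only, not part of the original analysis): record the
# package versions used for this run alongside the citations above.
# sessionInfo()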