require(tidyverse)   # Modern R
require(readxl)      # Read Excel files
require(pdftools)    # Read PDF files
require(quanteda)    # Analyze text

#
# BUILD A CORPUS WITH THE FULL TEXTS

# Ingest page-by-page
pdfs <- dir("papers_citavi", pattern="pdf$")
big <- list()
for (file in pdfs) {
  print(file)
  path <- paste("papers_citavi", file, sep="/")
  tryCatch(
    expr = {
      content <- pdf_text(path) %>%
        paste(collapse=" ") %>%
        str_replace_all("\\s+", " ")
      big[[file]] <- content
    },
    warning = function(w) NULL,
    error = function(e) NULL
  )
}
rm(file, path, content, pdfs)
big <- data.frame(big) %>%
  stack() %>%
  rename(text=values, file=ind)

# Save to disk
save(big, file="big.Rda")

#
# PREPARE THE TEXT ANALYSIS

sanitize <- function(item){
  item <- gsub(" [A-Z]\\.", "", item)                          # Abbreviated species
  item <- gsub("degrees C|° *C", "degrees_C ", item)           # Temperatures
  item <- gsub("-", "_", item)                                 # Hyphens
  item <- gsub("\\d+", "", item)                               # Numeric values
  item <- gsub("\\([^)]*\\)", "", item)                        # Parentheses
  item <- gsub("\\[[^)]*\\]", "", item)                        # Brackets
  item <- gsub("[[:punct:]]", "", item)                        # Symbols
  item <- gsub("'", "", item)                                  # Apostrophes
  item <- gsub(" references.*$", " ", item, ignore.case=TRUE)  # References
  item <- gsub(" \\S ", " ", item, ignore.case=TRUE)           # Single characters
  item <- gsub(" \\w{2} ", " ", item, ignore.case=TRUE)        # Very short words
  item <- gsub(" al ", " ", item, ignore.case=TRUE)            # 'al' from 'et al.'
  return(item)
}

expressions <- phrase(c("sex ratio",
                        "life cycle",
                        "life history",
                        "*ism rate",
                        "body size",
                        "host size",
                        "host quality",
                        "host density",
                        "natural enem*",
                        "biological control",
                        "mass rearing",
                        "foundress number",
                        "brood size"
                        ))

topics <- dictionary(list(
  sexratio    = c("sex_ratio", "*male_biased"),
  temperature = c("degre*", "°*", "temper*", "warm*", "heat*"),
  humidity    = c("humid*", "damp", "hygro*"),
  pressure    = c("hPa", "bar", "atmos*"),
  diet        = c("diet*", "nutri*", "food", "forag*"),
  structure   = c("patch*", "habit*", "distan*", "locat*", "wind"),
  hosts       = c("host_quality", "host_size"),
  dynamics    = c("LMC", "compet*", "*ism_rate*", "life_histor*", "foundress*")
))

#
# ANALYZE THE CORPUS OF ABSTRACTS

smol <- read_excel('abstracts.xlsx') %>% as_tibble()
smol$Abstract <- sapply(smol$Abstract, sanitize)
smol <- as.data.frame(smol)

corp_abs <- corpus(smol$Abstract, docnames=smol$ID)
toks_abs <- tokens(corp_abs, remove_punct=FALSE)
toks_abs <- tokens_compound(toks_abs, expressions)
freq_abs <- dfm(toks_abs,
                tolower=TRUE,
                stem=FALSE,
                remove_punct=TRUE,
                remove=stopwords("english")
                )
counts_abs <- dfm_lookup(freq_abs, dictionary=topics)
counts_abs <- counts_abs %>% convert(to="data.frame") %>% as_tibble()
write.table(counts_abs, file="counts_abs.csv", row.names=FALSE, na="", sep=";")

#
# ANALYZE THE CORPUS OF FULL TEXTS

# Load from disk
load("big.Rda")
big <- big %>% as_tibble() %>% as.data.frame()

# Replace problematic parts
big$text <- sapply(big$text, sanitize)

# Create a corpus
corp_full <- corpus(big$text, docnames=big$file)

# Tokenize the text
toks_full <- tokens(corp_full)

# Compound expressions
toks_full <- tokens_compound(toks_full, expressions)

# Create a frequency matrix
freq_full <- dfm(toks_full,
                 tolower=TRUE,
                 stem=FALSE,
                 remove_punct=TRUE,
                 remove=stopwords("english"))

# Remove short words and ones that occur rarely
freq_full <- dfm_trim(freq_full, min_nchar=3)
# freq_full <- dfm_trim(freq_full, min_termfreq=13)

# Combine plurals and singulars
# ...
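
# Optional sanity check (a sketch, not part of the original pipeline):
# list a few features containing an underscore to confirm that the
# multi-word expressions above were compounded as expected, e.g. "sex_ratio".
# head(featnames(freq_full)[grepl("_", featnames(freq_full))], 20)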

# Check the most and least frequent words
# topfeatures(freq_full)
# tail(freq_full)

# Assign weights to tokens
# freq_full_weighted <- dfm_tfidf(freq_full) # Weight the features using tf-idf

# Assign topics
counts_full <- dfm_lookup(freq_full, dictionary=topics)
counts_full <- counts_full %>% convert(to="data.frame") %>% as_tibble()
counts_full$doc_id <- factor(counts_full$doc_id)

# Export
write.table(counts_full, file="counts_full.csv", row.names=FALSE, na="", sep=";")

#
# REFERENCES

# Cite used packages
citation()
packages <- c('tidyverse', 'readxl', 'pdftools', 'quanteda')
for (p in packages) {
  print(citation(p))
}
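
# Optional (base R only, not part of the original analysis): record the
# package versions used for this run alongside the citations above.
# sessionInfo()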