Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#' Word count distribution from a directory of literature
#'
#' Reads every pdf file in a folder, builds a stemmed, lower-cased
#' term-document matrix from their text and returns how often each term occurs
#' across all of the files. Optionally prints a bar chart of a range of the
#' most frequent terms.
#'
#' @param literatureFileDirectory Path to the directory holding the pdf files
#'   to count words from.
#' @param omitStop If `TRUE`, stop words are dropped while the term-document
#'   matrix is built.
#' @param omitUselessTerms If `TRUE`, the terms listed in `uselessTerms` are
#'   removed from the result.
#' @param firstWordRankInPlot Frequency rank (1 = most frequent) of the first
#'   word shown in the plot.
#' @param lastWordRankInPlot Frequency rank of the last word shown in the plot.
#'   Ranks outside the number of available terms are ignored.
#' @param freqPlot If `TRUE`, print a bar chart of the selected word ranks.
#' @param uselessTerms Character vector of terms to drop when
#'   `omitUselessTerms` is `TRUE`. Terms are matched after stemming, so they
#'   must be given in stemmed form (e.g. "tabl" rather than "table").
#' @return A data frame with one row per retained term and the columns `word`
#'   (the stemmed term) and `wordcount` (its total number of occurrences over
#'   all pdf files).
#' @examples
#' \dontrun{
#' wordCountDistFromLitFolder("path/to/pdf/folder")
#' }
wordCountDistFromLitFolder <- function(literatureFileDirectory = "",
                                       omitStop = TRUE,
                                       omitUselessTerms = TRUE,
                                       firstWordRankInPlot = 1,
                                       lastWordRankInPlot = 30,
                                       freqPlot = TRUE,
                                       uselessTerms = c("result", "fig", "tabl",
                                                        "use", "research",
                                                        "total", "can")) {
  # Validate the input before touching any package so bad paths fail fast.
  if (!is.character(literatureFileDirectory) ||
        length(literatureFileDirectory) != 1L ||
        !dir.exists(literatureFileDirectory)) {
    stop("The specified directory does not exist", call. = FALSE)
  }

  # Full paths are used instead of setwd() so the caller's working directory
  # is never changed.
  pdfFiles <- list.files(
    literatureFileDirectory,
    pattern = "\\.pdf$",
    ignore.case = TRUE,
    full.names = TRUE
  )
  if (length(pdfFiles) == 0L) {
    stop("No pdf files were found in the specified directory", call. = FALSE)
  }

  # Packages are used through their namespaces rather than attached with
  # library(), which would alter the caller's search path.
  requiredPackages <- c("pdftools", "tm")
  if (isTRUE(freqPlot)) {
    requiredPackages <- c(requiredPackages, "ggplot2")
  }
  isInstalled <- vapply(
    requiredPackages, requireNamespace, logical(1), quietly = TRUE
  )
  if (!all(isInstalled)) {
    stop(
      "Missing required package(s): ",
      paste(requiredPackages[!isInstalled], collapse = ", "),
      call. = FALSE
    )
  }

  literatureCorpus <- tm::Corpus(
    tm::URISource(pdfFiles),
    readerControl = list(reader = tm::readPDF)
  )

  # Replace potentially problematic symbols with a space so the words they
  # join or wrap are tokenised separately.
  toSpace <- tm::content_transformer(function(x, pattern) {
    gsub(pattern, " ", x)
  })
  for (symbol in c("-", "’", "‘", "•", "”", "“")) {
    literatureCorpus <- tm::tm_map(literatureCorpus, toSpace, symbol)
  }

  tdm <- tm::TermDocumentMatrix(
    literatureCorpus,
    control = list(
      removePunctuation = TRUE,
      stopwords = isTRUE(omitStop),
      tolower = TRUE,
      stemming = TRUE,
      removeNumbers = TRUE
    )
  )

  frequentTerms <- tm::findFreqTerms(tdm, lowfreq = 1, highfreq = Inf)
  if (isTRUE(omitUselessTerms)) {
    frequentTerms <- frequentTerms[!(frequentTerms %in% uselessTerms)]
  }

  # Total occurrences of each retained term over all documents.
  wordcount <- rowSums(as.matrix(tdm[frequentTerms, ]))

  if (isTRUE(freqPlot)) {
    sortedCounts <- sort(wordcount, decreasing = TRUE)
    # Drop ranks that do not exist; indexing past the end would otherwise
    # yield NA words and NA bars in the plot.
    wordRanks <- seq(firstWordRankInPlot, lastWordRankInPlot)
    wordRanks <- wordRanks[wordRanks >= 1 & wordRanks <= length(sortedCounts)]
    topWords <- sortedCounts[wordRanks]

    if (length(topWords) > 0L) {
      dfplot <- data.frame(
        word = names(topWords),
        value = unname(topWords),
        stringsAsFactors = FALSE
      )
      # Fix the factor level order so bars appear from most to least frequent
      # instead of alphabetically.
      dfplot$word <- factor(
        dfplot$word,
        levels = dfplot$word[order(dfplot$value, decreasing = TRUE)]
      )
      fig <- ggplot2::ggplot(dfplot, ggplot2::aes(x = word, y = value)) +
        ggplot2::geom_bar(stat = "identity") +
        ggplot2::xlab("Word in pdf files") +
        ggplot2::ylab("Count") +
        ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
      print(fig)
    }
  }

  data.frame(word = frequentTerms, wordcount = wordcount)
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement