Advertisement
Guest User

Untitled

a guest
Jul 19th, 2019
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.25 KB | None | 0 0
  #' Word count distribution from a directory of literature
  #'
  #' Reads every PDF file found directly inside a folder, builds a
  #' term-document matrix from the text (lower-cased, stemmed, punctuation and
  #' numbers removed) and returns how often each remaining term occurs across
  #' all files. Optionally prints a bar chart of the most frequent terms.
  #'
  #' @param literatureFileDirectory Path of the directory holding the PDF files
  #'   to count words from.
  #' @param omitStop If `TRUE` (default), stop words are removed while the
  #'   term-document matrix is built; if `FALSE` they are kept.
  #' @param omitUselessTerms If `TRUE` (default), the terms listed in
  #'   `uselessTerms` are dropped from the result.
  #' @param firstWordRankInPlot,lastWordRankInPlot Frequency ranks (1 = most
  #'   frequent term) of the first and last term shown in the plot. The range
  #'   is clamped to the terms that actually exist.
  #' @param freqPlot If `TRUE` (default), print a bar chart of the terms in the
  #'   requested rank range.
  #' @param uselessTerms Character vector of terms to drop when
  #'   `omitUselessTerms` is `TRUE`. Terms are compared after stemming, so
  #'   supply stems (e.g. `"tabl"` rather than `"table"`).
  #' @return A data frame with one row per term and the columns `word` (the
  #'   term) and `wordcount` (its total number of occurrences over all files).
  #' @examples
  #' \dontrun{
  #' wordCountDistFromLitFolder("path/to/folder/with/pdfs")
  #' }
  wordCountDistFromLitFolder <- function(literatureFileDirectory = "",
                                         omitStop = TRUE,
                                         omitUselessTerms = TRUE,
                                         firstWordRankInPlot = 1,
                                         lastWordRankInPlot = 30,
                                         freqPlot = TRUE,
                                         uselessTerms = c("result", "fig",
                                                          "tabl", "use",
                                                          "research", "total",
                                                          "can")) {
    # Validate the input before attaching any package so a bad path fails fast
    # with a clear message.
    if (!is.character(literatureFileDirectory) ||
        length(literatureFileDirectory) != 1L ||
        is.na(literatureFileDirectory) ||
        !dir.exists(literatureFileDirectory)) {
      stop("The specified directory does not exist", call. = FALSE)
    }

    # Full paths are used instead of setwd() so the caller's working directory
    # is never touched. The pattern requires a real ".pdf" extension (any case).
    pdfFiles <- list.files(literatureFileDirectory,
                           pattern = "\\.pdf$",
                           ignore.case = TRUE,
                           full.names = TRUE)
    if (length(pdfFiles) == 0L) {
      stop("The specified directory contains no pdf files", call. = FALSE)
    }

    # NOTE(review): tidytext and reshape2 are not used below; they are still
    # attached so the search path after a call is the same as before this
    # revision. Drop them once no caller relies on that.
    library(pdftools)
    library(NLP)
    library(tm)
    library(tidytext)
    library(reshape2)
    library(ggplot2)

    literatureCorpus <- Corpus(URISource(pdfFiles),
                               readerControl = list(reader = readPDF))

    # Replace potentially problematic symbols with a space, one at a time.
    toSpace <- content_transformer(function(x, pattern) {
      gsub(pattern, " ", x)
    })
    for (symbol in c("-", "’", "‘", "•", "”", "“")) {
      literatureCorpus <- tm_map(literatureCorpus, toSpace, symbol)
    }

    tdm <- TermDocumentMatrix(
      literatureCorpus,
      control = list(removePunctuation = TRUE,
                     stopwords = isTRUE(omitStop),
                     tolower = TRUE,
                     stemming = TRUE,
                     removeNumbers = TRUE)
    )

    # lowfreq = 1 keeps every term that occurs at least once.
    frequentTerms <- findFreqTerms(tdm, lowfreq = 1, highfreq = Inf)

    if (omitUselessTerms) {
      frequentTerms <- frequentTerms[!(frequentTerms %in% uselessTerms)]
    }

    # Total occurrences of each kept term, summed over all documents.
    wordcount <- if (length(frequentTerms) > 0L) {
      rowSums(as.matrix(tdm[frequentTerms, ]))
    } else {
      numeric(0)
    }

    if (freqPlot) {
      rankedCounts <- sort(wordcount, decreasing = TRUE)

      # Clamp the requested ranks to what exists; indexing past the end would
      # otherwise put NA bars into the plot.
      firstRank <- max(1, firstWordRankInPlot)
      lastRank <- min(lastWordRankInPlot, length(rankedCounts))

      if (firstRank <= lastRank) {
        topWords <- rankedCounts[firstRank:lastRank]

        dfplot <- data.frame(word = names(topWords),
                             value = unname(topWords))
        # topWords is already in decreasing order; fixing the factor levels in
        # that order stops ggplot from sorting the bars alphabetically.
        dfplot$word <- factor(dfplot$word, levels = dfplot$word)

        fig <- ggplot(dfplot, aes(x = word, y = value)) +
          geom_bar(stat = "identity") +
          xlab("Word in pdf files") +
          theme(axis.text.x = element_text(angle = 90)) +
          ylab("Count")
        print(fig)
      } else {
        warning("No terms fall within the requested rank range; plot skipped",
                call. = FALSE)
      }
    }

    data.frame(word = frequentTerms, wordcount = wordcount)
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement