Advertisement
Guest User

Untitled

a guest
Oct 16th, 2019
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.23 KB | None | 0 0
  1. ---
  2. title: "Text mining"
  3. output: html_notebook
  4. ---
  5.  
  6. Source tutorial: <https://rpubs.com/pjmurphy/265713>
  7.  
  8. Install packages
  9. ```{r}
  # Packages this notebook depends on. "SnowballC" is the stemmer backend
  # used by tm::stemDocument().
  Needed <- c(
    "tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust",
    "cluster", "igraph", "fpc"
  )

  # Install only the packages that are not present yet, so re-running the
  # notebook does not reinstall everything each time.
  missing_pkgs <- setdiff(Needed, rownames(installed.packages()))
  if (length(missing_pkgs) > 0) {
    install.packages(missing_pkgs, dependencies = TRUE)
  }
  13. ```
  14.  
  15. Load data
  16.  
  17. ```{r}
  # Rcampdf is built from source from the repository given below. Install it
  # only when it is not already available instead of on every run.
  # NOTE(review): the repository URL is plain HTTP; prefer an HTTPS mirror if
  # one exists.
  if (!requireNamespace("Rcampdf", quietly = TRUE)) {
    install.packages(
      "Rcampdf",
      repos = "http://datacube.wu.ac.at/",
      type = "source"
    )
  }

  # Folder holding the text files to analyse (machine-specific path).
  cname <- file.path("C:\\Users\\dstudent\\Documents\\texts")

  # Fail early with a clear message rather than listing nothing.
  if (!dir.exists(cname)) {
    stop("Text folder not found: ", cname, call. = FALSE)
  }

  cname
  dir(cname)
  23. ```
  24.  
  25. First preprocess data
  26. ```{r}
  library(tm)

  # Build a corpus from the files found in `cname`.
  docs <- VCorpus(DirSource(cname))
  summary(docs)

  inspect(docs[1])

  # Print the first document's text. `docs[[1]]` extracts the document
  # itself; `docs[1]` is a one-document corpus whose as.character() form is
  # a deparsed list rather than readable text.
  if (length(docs) > 0) {
    writeLines(as.character(docs[[1]]))
  }

  # Replace separator characters with a space. This has to happen BEFORE
  # removePunctuation(): "/", "@" and "|" are punctuation themselves, so
  # removing punctuation first would glue the neighbouring words together
  # and leave nothing for these substitutions to match.
  # content_transformer() applies the substitution to each document's text
  # while keeping the element a text document (assigning the gsub() result
  # straight into docs[[j]] would overwrite it with a bare character vector).
  to_space <- content_transformer(
    function(x, pattern) gsub(pattern, " ", x)
  )
  docs <- tm_map(docs, to_space, "/")
  docs <- tm_map(docs, to_space, "@")
  docs <- tm_map(docs, to_space, "\\|")
  # U+2028 is the Unicode "line separator" character.
  docs <- tm_map(docs, to_space, "\u2028")

  docs <- tm_map(docs, removePunctuation)

  # Check to see if it worked.
  if (length(docs) > 0) {
    writeLines(as.character(docs[[1]]))
  }
  45. ```
  46.  
  47. Remove numbers
  48. ```{r}
  # Strip digits from every document in the corpus.
  docs <- tm_map(docs, removeNumbers)

  # Check to see if it worked: print the first document's text.
  # `docs[[1]]` is the document; `docs[1]` would be a one-document corpus,
  # which as.character() turns into a deparsed list instead of the text.
  if (length(docs) > 0) {
    writeLines(as.character(docs[[1]]))
  }
  51. ```
  52.  
  53. Change letters to lower case
  54. ```{r}
  # Lower-case the text of one document while keeping it a text document.
  # Mapping tolower() directly would replace each document with a bare
  # character vector, and re-wrapping that with PlainTextDocument() creates
  # a fresh document without the original metadata (such as its id).
  lower_case_doc <- function(doc) {
    # Other steps may have left bare character vectors in the corpus; wrap
    # those so the result is always a text document.
    if (!inherits(doc, "TextDocument")) {
      doc <- PlainTextDocument(doc)
    }
    content(doc) <- tolower(content(doc))
    doc
  }
  docs <- tm_map(docs, lower_case_doc)

  # Snapshot of the lower-cased corpus; it is passed as the dictionary to
  # stemCompletion() further down.
  DocsCopy <- docs

  # Check to see if it worked: print the first document's text.
  if (length(docs) > 0) {
    writeLines(as.character(docs[[1]]))
  }
  59. ```
  60.  
  61. Remove words
  62. ```{r}
  # Drop English stop words. removeWords() returns the same kind of object
  # it was given, so no PlainTextDocument() re-wrap (which would discard
  # document metadata) is needed afterwards.
  docs <- tm_map(docs, removeWords, stopwords("english"))

  # Check to see if it worked: print the first document's text.
  if (length(docs) > 0) {
    writeLines(as.character(docs[[1]]))
  }

  # Drop additional words that are not of interest for this analysis.
  docs <- tm_map(docs, removeWords, c("syllogism", "tautology"))

  # Join multi-word expressions so each is kept together as one token.
  # Done through tm_map() so every element stays a text document instead of
  # being overwritten with the bare character result of gsub().
  combine_phrases <- function(doc) {
    if (!inherits(doc, "TextDocument")) {
      doc <- PlainTextDocument(doc)
    }
    txt <- content(doc)
    txt <- gsub("fake news", "fake_news", txt)
    txt <- gsub("inner city", "inner-city", txt)
    txt <- gsub("politically correct", "politically_correct", txt)
    content(doc) <- txt
    doc
  }
  docs <- tm_map(docs, combine_phrases)
  75. ```
  76.  
  77. Stem words and strip extra whitespace
  78. ```{r}
  # Make sure every element is a text document. Only bare character vectors
  # are wrapped, so documents that still carry metadata keep it.
  as_text_document <- function(doc) {
    if (inherits(doc, "TextDocument")) doc else PlainTextDocument(doc)
  }
  docs <- tm_map(docs, as_text_document)

  # Stemmed copy of the corpus; `docs` itself is left unstemmed.
  docs_st <- tm_map(docs, stemDocument)
  # Check to see if it worked.
  if (length(docs_st) > 0) {
    writeLines(as.character(docs_st[[1]]))
  }

  # Stem-completed copy. stemCompletion() works on a vector of individual
  # stems, so each document is split into words first; mapping it over whole
  # documents would treat every line of text as one "stem".
  # The dictionary words are extracted once, outside the per-document call.
  completion_dictionary <- unique(unlist(lapply(DocsCopy, words)))
  complete_stems <- function(doc, dictionary) {
    stems <- unlist(strsplit(as.character(doc), "[[:space:]]+"))
    stems <- stems[nzchar(stems)]
    if (length(stems) == 0) {
      return(doc)
    }
    completed <- unname(stemCompletion(stems, dictionary = dictionary))
    # Keep the stem itself when the dictionary offers no completion.
    unmatched <- is.na(completed) | !nzchar(completed)
    completed[unmatched] <- stems[unmatched]
    # NOTE: the completed words are stored as a single space-separated line.
    content(doc) <- paste(completed, collapse = " ")
    doc
  }
  docs_stc <- tm_map(
    docs_st, complete_stems,
    dictionary = completion_dictionary
  )
  # Check to see if it worked.
  if (length(docs_stc) > 0) {
    writeLines(as.character(docs_stc[[1]]))
  }

  # Collapse runs of whitespace in the working corpus. stripWhitespace()
  # keeps the elements as text documents, so no re-wrap is needed.
  docs <- tm_map(docs, stripWhitespace)
  # Check to see if it worked.
  if (length(docs) > 0) {
    writeLines(as.character(docs[[1]]))
  }
  93. ```
  94.  
  95.  
  96. Generate matrix
  97. ```{r}
  # Document-term matrix of the preprocessed corpus.
  dtm <- DocumentTermMatrix(docs)
  print(dtm)

  # The same counts arranged as a term-document matrix.
  tdm <- TermDocumentMatrix(docs)
  print(tdm)

  # Dense copy of the document-term matrix; its column sums are the total
  # count of each term across all documents.
  m <- as.matrix(dtm)
  freq <- colSums(m)
  print(length(freq))

  # Term positions ordered by increasing total count.
  ord <- order(freq)

  print(dim(m))
  111. ```
  112.  
  113. Save to file
  114.  
  115. ```{r}
  # Export the dense document-term matrix as CSV.
  write.csv(x = m, file = "DocumentTermMatrix.csv")

  # Reduced matrix after removing sparse terms (sparsity threshold 0.2).
  dtms <- removeSparseTerms(dtm, sparse = 0.2)
  print(dtms)

  # Total count of every term in the full matrix.
  total_by_term <- colSums(as.matrix(dtm))

  # How many terms share each total count (the 20 highest counts).
  freq <- total_by_term
  print(tail(table(freq), n = 20))

  # Totals for the terms that survived the sparsity filter.
  freq <- colSums(as.matrix(dtms))
  print(freq)

  # Totals for all terms, most frequent first; show the top 14.
  freq <- sort(total_by_term, decreasing = TRUE)
  print(head(freq, n = 14))

  # Terms that occur at least 50 times.
  print(findFreqTerms(dtm, lowfreq = 50))

  # Word/count table used for plotting.
  wf <- data.frame(word = names(freq), freq = freq)
  print(head(wf))
  135. ```
  136.  
  137. Generate plot
  138. ```{r}
  library(ggplot2)

  # Keep only the words counted more than 50 times.
  frequent_words <- wf[!is.na(wf$freq) & wf$freq > 50, , drop = FALSE]

  # Bar chart of those words, bars ordered from most to least frequent,
  # with the x-axis labels rotated by 45 degrees.
  p <- ggplot(
    data = frequent_words,
    mapping = aes(x = reorder(word, -freq), y = freq)
  )
  p <- p + geom_bar(stat = "identity")
  p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(p)
  146. ```
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement