Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ---
- title: "Text mining"
- output: html_notebook
- ---
- #https://rpubs.com/pjmurphy/265713
- Install packages
- ```{r}
- # Packages this notebook relies on. The stemming backend used by tm is
- # "SnowballC"; the earlier spelling "SnowballCC" is not that package's name.
- Needed <- c("tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust",
-             "cluster", "igraph", "fpc")
- # Install only what is missing so re-running this chunk does not reinstall everything.
- missing_pkgs <- setdiff(Needed, rownames(installed.packages()))
- if (length(missing_pkgs) > 0) {
-   install.packages(missing_pkgs, dependencies = TRUE)
- }
- ```
- Load data
- ```{r}
- # Rcampdf comes from a non-CRAN repository; install it only when it is absent
- # so that re-running the chunk does not rebuild it from source every time.
- if (!requireNamespace("Rcampdf", quietly = TRUE)) {
-   install.packages("Rcampdf", repos = "http://datacube.wu.ac.at/", type = "source")
- }
- # Folder holding the text files to mine (machine-specific path).
- cname <- file.path("C:\\Users\\dstudent\\Documents\\texts")
- cname
- # Fail here with a clear message instead of later, when the corpus is built.
- if (!dir.exists(cname)) {
-   stop("Text directory not found: ", cname, call. = FALSE)
- }
- dir(cname)
- ```
- First preprocess data
- ```{r}
- library(tm)
- # Build a corpus with one document per file found in `cname`.
- docs <- VCorpus(DirSource(cname))
- summary(docs)
- inspect(docs[1])
- # docs[[1]] is the first document itself, so this prints its text.
- writeLines(as.character(docs[[1]]))
- # Turn separator characters into spaces BEFORE stripping punctuation.
- # removePunctuation deletes "/", "@" and "|" outright, so doing this
- # afterwards finds nothing to replace and leaves words fused ("and/or" -> "andor").
- # content_transformer() keeps each element a text document; assigning gsub()
- # output to docs[[j]] would replace it with a bare character vector.
- to_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
- # "\u2028" is the Unicode LINE SEPARATOR (not an ASCII character); it did not
- # translate, so it has to be removed.
- for (pattern in c("/", "@", "\\|", "\u2028")) {
-   docs <- tm_map(docs, to_space, pattern)
- }
- docs <- tm_map(docs, removePunctuation)
- writeLines(as.character(docs[[1]])) # Check to see if it worked.
- ```
- Remove numbers
- ```{r}
- # Drop digits from every document in the corpus.
- docs <- tm_map(docs, removeNumbers)
- # Print the first document itself: docs[[1]] is the document, whereas docs[1]
- # is a one-document corpus and as.character() on it does not give the text.
- writeLines(as.character(docs[[1]])) # Check to see if it worked.
- ```
- Change letters to lower case
- ```{r}
- # Lower-case every document. tolower() on its own returns a bare character
- # vector, and re-wrapping that with PlainTextDocument() builds a fresh document
- # with default metadata, so the content is updated in place instead.
- lower_doc <- function(doc) {
-   if (!inherits(doc, "TextDocument")) {
-     # An earlier step may have left a plain character vector; wrap it once.
-     doc <- PlainTextDocument(doc)
-   }
-   content(doc) <- tolower(content(doc))
-   doc
- }
- docs <- tm_map(docs, lower_doc)
- # Lower-cased snapshot, used later as the dictionary for stem completion.
- DocsCopy <- docs
- writeLines(as.character(docs[[1]])) # Check to see if it worked.
- ```
- Remove words
- ```{r}
- # Remove English stop words. removeWords keeps the documents' class, so no
- # PlainTextDocument() re-wrap (which would reset their metadata) is needed.
- docs <- tm_map(docs, removeWords, stopwords("english"))
- writeLines(as.character(docs[[1]])) # Check to see if it worked.
- # Remove additional words that are not wanted in the analysis.
- docs <- tm_map(docs, removeWords, c("syllogism", "tautology"))
- # Join multi-word phrases so each is kept together as one token. Going through
- # content_transformer() keeps every element a text document; assigning gsub()
- # output to docs[[j]] would replace it with a bare character vector.
- join_phrase <- content_transformer(
-   function(x, phrase, joined) gsub(phrase, joined, x)
- )
- phrases <- c(
-   "fake news" = "fake_news",
-   "inner city" = "inner-city",
-   "politically correct" = "politically_correct"
- )
- for (phrase in names(phrases)) {
-   docs <- tm_map(docs, join_phrase, phrase, phrases[[phrase]])
- }
- ```
- Stem words and strip whitespace
- ```{r}
- # Make sure every element is a text document, but only wrap the ones that are
- # not: an unconditional PlainTextDocument() call rebuilds each document with
- # default metadata and so throws away what is already there.
- as_text_doc <- function(doc) {
-   if (inherits(doc, "TextDocument")) doc else PlainTextDocument(doc)
- }
- docs <- tm_map(docs, as_text_doc)
- # Stemmed copy of the corpus; `docs` itself is left unstemmed.
- docs_st <- tm_map(docs, stemDocument)
- writeLines(as.character(docs_st[[1]])) # Check to see if it worked.
- # NOTE(review): lazy = TRUE defers this mapping, and stemCompletion() is
- # documented for character vectors of stems rather than whole documents, so
- # this step may not actually complete anything. docs_stc is not used by the
- # chunks that follow here -- verify before relying on it.
- docs_stc <- tm_map(docs_st, stemCompletion, dictionary = DocsCopy, lazy=TRUE)
- docs_stc <- tm_map(docs_stc, PlainTextDocument)
- writeLines(as.character(docs_stc[1])) # Check to see if it worked.
- # Collapse runs of whitespace left behind by the removals above.
- docs <- tm_map(docs, stripWhitespace)
- writeLines(as.character(docs[[1]])) # Check to see if it worked.
- ```
- Generate matrix
- ```{r}
- # Document-term matrix: one row per document, one column per term.
- dtm <- DocumentTermMatrix(docs)
- dtm
- # The transpose: one row per term, one column per document.
- tdm <- TermDocumentMatrix(docs)
- tdm
- # Convert the sparse matrix to a dense one once and reuse it below; the
- # conversion was previously done twice.
- m <- as.matrix(dtm)
- # Total number of occurrences of each term across all documents.
- freq <- colSums(m)
- length(freq)
- # Permutation that sorts the terms by increasing frequency.
- ord <- order(freq)
- dim(m)
- ```
- Save to file and compute term frequencies
- ```{r}
- # Export the dense document-term matrix built in the previous chunk.
- write.csv(m, file="DocumentTermMatrix.csv")
- # Keep only terms whose sparsity is below 0.2.
- dtms <- removeSparseTerms(dtm, 0.2)
- dtms
- # Term totals over the full matrix; `m` already holds as.matrix(dtm), so it is
- # reused instead of converting the sparse matrix again.
- freq <- colSums(m)
- # How many terms occur with each of the 20 highest frequency values.
- tail(table(freq), 20)
- # Term totals restricted to the non-sparse terms.
- freq <- colSums(as.matrix(dtms))
- freq
- # Back to the full matrix, most frequent terms first.
- freq <- sort(colSums(m), decreasing = TRUE)
- head(freq, 14)
- # Terms that occur at least 50 times.
- findFreqTerms(dtm, lowfreq = 50)
- # Word/frequency table used for plotting.
- wf <- data.frame(word = names(freq), freq = freq)
- head(wf)
- ```
- Generate plot
- ```{r}
- library(ggplot2)
- # Bar chart of the words that occur more than 50 times, tallest bar first.
- frequent_terms <- subset(wf, freq>50)
- p <- ggplot(data = frequent_terms,
-             mapping = aes(x = reorder(word, -freq), y = freq))
- # Bar heights come straight from the freq column.
- p <- p + geom_bar(stat = "identity")
- # Slant the x-axis labels so long words do not overlap.
- p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
- p
- ```
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement