---
title: "Text mining"
output: html_notebook
---

Reference: <https://rpubs.com/pjmurphy/265713>

Install packages
```{r}
# Packages to install for the text-mining workflow. "SnowballC" provides the
# stemmer backend that tm's stemDocument() relies on.
Needed <- c("tm", "SnowballC", "RColorBrewer", "ggplot2", "wordcloud", "biclust",
            "cluster", "igraph", "fpc")
# Only download what is not installed yet, so re-running the chunk is cheap.
missing_pkgs <- setdiff(Needed, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}
```

Load data

```{r}
# Rcampdf is fetched as a source package from the repository given below.
install.packages(
  "Rcampdf",
  repos = "http://datacube.wu.ac.at/",
  type = "source"
)

# Directory whose files are read into the corpus in the next chunk.
cname <- file.path("C:\\Users\\dstudent\\Documents\\texts")
cname
# List the files found in that directory.
list.files(cname)
```

First preprocess data
```{r}
library(tm)

# Read every file found in `cname` into a volatile corpus.
docs <- VCorpus(DirSource(cname))
summary(docs)

inspect(docs[1])

# Show the raw text of the first document. Use [[ ]] to get the document
# itself: docs[1] is a one-element corpus, and as.character() on a corpus
# prints a deparsed list rather than the text.
writeLines(as.character(docs[[1]]))

# Replace separator characters with a space *before* stripping punctuation.
# removePunctuation() deletes "/", "@" and "|" outright, so running it first
# would glue the neighbouring words together (e.g. "and/or" -> "andor").
# content_transformer() keeps every element a text document instead of
# overwriting it with a bare character vector.
to_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# "\u2028" is the Unicode line separator; it did not translate, so it is removed too.
for (pattern in c("/", "@", "\\|", "\u2028")) {
  docs <- tm_map(docs, to_space, pattern)
}

docs <- tm_map(docs, removePunctuation)

writeLines(as.character(docs[[1]])) # Check to see if it worked.
```

Remove numbers
```{r}
# Strip digits from every document.
docs <- tm_map(docs, removeNumbers)
# Index with [[ ]] to print the first document's text; docs[1] is a
# one-element corpus and as.character() on it yields a deparsed list.
writeLines(as.character(docs[[1]])) # Check to see if it worked.
```

Convert letters to lower case
```{r}
# Lower-case one corpus element and return it as a text document.
# A text document is transformed through content_transformer(), so its
# metadata (e.g. the document id) is kept; a bare character vector is
# lower-cased and wrapped into a new PlainTextDocument.
lower_case <- function(doc) {
  if (inherits(doc, "TextDocument")) {
    content_transformer(tolower)(doc)
  } else {
    PlainTextDocument(tolower(doc))
  }
}
docs <- tm_map(docs, lower_case)
# Snapshot of the lower-cased corpus; used later as the stem-completion dictionary.
DocsCopy <- docs
# [[ ]] extracts the document itself so its text (not a deparsed corpus) is printed.
writeLines(as.character(docs[[1]])) # Check to see if it worked.
```

Remove words
```{r}
# Drop common English stop words. removeWords() returns text documents when
# given text documents, so no re-wrapping with PlainTextDocument is needed
# (re-wrapping would reset the document metadata).
docs <- tm_map(docs, removeWords, stopwords("english"))
# [[ ]] extracts the document itself so its text (not a deparsed corpus) is printed.
writeLines(as.character(docs[[1]])) # Check to see if it worked.

# Drop further words that should not appear in the analysis.
docs <- tm_map(docs, removeWords, c("syllogism", "tautology"))

# Join multi-word phrases so each is counted as a single term.
# content_transformer() edits the text in place and keeps every element a
# text document, unlike assigning gsub() output back into the corpus.
replace_text <- content_transformer(
  function(x, pattern, replacement) gsub(pattern, replacement, x)
)
phrases <- c(
  "fake news" = "fake_news",
  "inner city" = "inner-city",
  "politically correct" = "politically_correct"
)
for (phrase in names(phrases)) {
  docs <- tm_map(docs, replace_text, phrase, phrases[[phrase]])
}
```

Stem words and strip whitespace
```{r}
# Make sure every corpus element is a text document. Bare character vectors
# are wrapped; existing documents are returned untouched so their metadata
# (e.g. the document id) is not reset.
as_text_document <- function(doc) {
  if (inherits(doc, "TextDocument")) doc else PlainTextDocument(doc)
}
docs <- tm_map(docs, as_text_document)

# Stemmed copy of the corpus; `docs` itself stays unstemmed.
docs_st <- tm_map(docs, stemDocument)
# [[ ]] extracts the document itself so its text (not a deparsed corpus) is printed.
writeLines(as.character(docs_st[[1]])) # Check to see if it worked.

# NOTE(review): stemCompletion() is documented as taking a character vector of
# stems, not a whole document, and lazy = TRUE defers the mapping. This step
# may therefore not complete stems as intended -- verify before relying on
# `docs_stc`. These three lines are deliberately left unchanged.
docs_stc <- tm_map(docs_st, stemCompletion, dictionary = DocsCopy, lazy=TRUE)
docs_stc <- tm_map(docs_stc, PlainTextDocument)
writeLines(as.character(docs_stc[1])) # Check to see if it worked.

# Collapse runs of whitespace left behind by the removals above.
# stripWhitespace() keeps text documents as text documents, so no final
# re-wrapping is required.
docs <- tm_map(docs, stripWhitespace)
writeLines(as.character(docs[[1]])) # Check to see if it worked.
```


Generate matrix
```{r}
# Document-term matrix: one row per document, one column per term.
dtm <- DocumentTermMatrix(docs)
dtm

# Term-document matrix: the same counts with terms in rows.
tdm <- TermDocumentMatrix(docs)
tdm

# Dense copy of the document-term matrix, reused for the totals below.
m <- as.matrix(dtm)

# Total count of every term across all documents.
freq <- colSums(m)
length(freq)

# Index vector that orders the terms by increasing total count.
ord <- order(freq)

dim(m)
```

Save the matrix to file and explore term frequencies

```{r}
# Export the dense document-term matrix.
write.csv(m, file = "DocumentTermMatrix.csv")

# Reduced matrix, built with a maximal allowed sparsity of 0.2.
dtms <- removeSparseTerms(dtm, 0.2)
dtms

# How many terms share each total count; show the 20 highest counts.
freq <- colSums(m)
tail(table(freq), 20)

# Term totals in the reduced matrix.
freq <- colSums(as.matrix(dtms))
freq

# Term totals in the full matrix, most frequent first.
freq <- sort(colSums(m), decreasing = TRUE)
head(freq, n = 14)

# Terms that occur at least 50 times.
findFreqTerms(dtm, lowfreq = 50)

# Word/frequency table used for plotting.
wf <- data.frame(
  word = names(freq),
  freq = freq
)
head(wf)
```

Generate plot
```{r}
library(ggplot2)

# Bar chart of the words that occur more than 50 times, most frequent first.
frequent_words <- subset(wf, freq > 50)
p <- ggplot(frequent_words, aes(x = reorder(word, -freq), y = freq))
p <- p + geom_bar(stat = "identity")
# Tilt the word labels so they do not overlap.
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
```