# Untitled

# Build and clean the corpus, then derive the document-term matrices.

# Directory whose files DirSource() reads into the corpus.
cname <- file.path("/home/dos/Desktop/blog/R")
#install.packages("tm")
library(tm)
docs <- VCorpus(DirSource(cname))

# Cleaning steps, applied to every document in order.
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
# tolower() is a plain character function, not a tm transformation. Wrapping
# it in content_transformer() keeps every corpus element a text document, so
# no later re-wrapping with PlainTextDocument() is needed (that re-wrapping
# would also discard per-document metadata such as ids).
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))
# Drop the author's own name so it does not dominate the counts.
docs <- tm_map(docs, removeWords, c("shreyas", "waghmare"))
# The removals above leave runs of blanks behind; collapse them.
docs <- tm_map(docs, stripWhitespace)

# Same counts in both orientations: documents x terms and terms x documents.
dtm <- DocumentTermMatrix(docs)
tdm <- TermDocumentMatrix(docs)
# Explore term frequencies and export the full document-term matrix.

# Dense copy of the document-term matrix, materialised once and reused for
# both the column sums and the CSV export (converting twice doubles the
# memory cost of the largest object in the script).
m <- as.matrix(dtm)
# Total occurrences of each term across all documents.
freq <- colSums(m)
length(freq)
# Permutation that sorts the terms by ascending frequency.
ord <- order(freq)
dim(m)
write.csv(m, file = "DocumentTermMatrix.csv")

# Keep only terms whose sparsity is below 0.1.
dtms <- removeSparseTerms(dtm, 0.1)

# table(freq) counts how many terms share each frequency value; head() shows
# the lowest frequency values, tail() the highest.
head(table(freq), 20)
freq
tail(table(freq), 20)
freq

# From here on `freq` holds the totals for the reduced term set only.
freq <- colSums(as.matrix(dtms))
freq
#install.packages("ggplot2")
library(ggplot2)
# Bar chart of the terms that occur more than 50 times.

# One row per term: the term itself and its total count.
wf <- data.frame(word = names(freq), freq = freq)

# Assemble the plot in a single expression: bars whose heights are the counts
# themselves (stat = "identity"), with the x-axis labels tilted so that
# neighbouring words do not overlap.
p <- ggplot(data = subset(wf, freq > 50), mapping = aes(x = word, y = freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
# Term associations.
#
# Report the terms correlated with each query term at or above `corlimit`,
# restricted to the query terms that actually occur in matrix `x`. Terms that
# were removed during cleaning or by the sparsity filter cannot have
# associations, so they are named in a message instead of silently producing
# an empty result. Returns the findAssocs() result, or invisible NULL when
# none of the query terms is present.
report_assocs <- function(x, terms, corlimit) {
  present <- terms %in% Terms(x)
  if (!all(present)) {
    message(
      "Skipping terms absent from the matrix: ",
      paste(terms[!present], collapse = ", ")
    )
  }
  if (!any(present)) {
    return(invisible(NULL))
  }
  findAssocs(x, terms[present], corlimit = corlimit)
}

report_assocs(dtm, c("shreyas", "waghmare"), corlimit = 0.85)
report_assocs(dtms, "think", corlimit = 0.70)
#install.packages("wordcloud")
library(wordcloud)
# Word cloud of the most frequent terms.

# Prepare the data: keep only terms whose sparsity is below 0.15.
dtms <- removeSparseTerms(dtm, 0.15)
# Take the frequencies from the filtered matrix prepared above; computing
# them from the unfiltered `dtm` would leave the sparsity filter without any
# effect on the plot.
freq <- colSums(as.matrix(dtms))
# Six colours from the "Dark2" palette.
dark2 <- brewer.pal(6, "Dark2")
# Draw at most 20 words; rot.per = 0.5 is passed through unchanged.
wordcloud(names(freq), freq, max.words = 20, rot.per = 0.5, colors = dark2)