# R_text_mining_chinese

# Chinese text-mining demo: read the texts in demo.csv, clean them, segment
# them into Chinese words, drop stop words, then build a term-document matrix
# and draw a word cloud of the most frequent terms.

library(tm)        # VCorpus(), tm_map(), TermDocumentMatrix(), removeWords()
library(tmcn)      # stopwordsCN()
library(Rwordseg)  # segmentCN()
library(wordcloud) # wordcloud()

# Keep text columns as character vectors (the default before R 4.0 was to
# convert them to factors).
demo <- read.csv("demo.csv", stringsAsFactors = FALSE)

# As in the original script, every column of demo.csv becomes one document.
# VCorpus is requested explicitly: on tm >= 0.7, Corpus() would build a
# SimpleCorpus, which cannot store the per-document token vectors produced by
# the segmentation step below.
corpus <- VCorpus(VectorSource(lapply(demo, as.character)))

# Remove every ASCII letter and digit so that only the Chinese text remains.
strip_latin_alnum <- function(x) {
  gsub("[A-Za-z0-9]", "", x)
}

# Segment each non-empty text into words and return all tokens of the document
# as one flat character vector. The part-of-speech names that
# segmentCN(nature = TRUE) attaches are not used downstream, so they are
# dropped.
segment_tokens <- function(x) {
  x <- x[!is.na(x) & nzchar(trimws(x))]
  if (length(x) == 0L) {
    return(character(0))
  }
  unlist(lapply(x, segmentCN, nature = TRUE), use.names = FALSE)
}

corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
# Custom functions must be wrapped in content_transformer(); passing a bare
# function to tm_map() replaces the documents with plain vectors, which is why
# the original script had to re-coerce the corpus with PlainTextDocument.
corpus <- tm_map(corpus, content_transformer(strip_latin_alnum))
corpus <- tm_map(corpus, content_transformer(segment_tokens))

# Custom greeting / filler words plus the tmcn Chinese dictionary and the
# English stop words shipped with tm. The original list also contained "\n"
# and ""; removeWords() matches whole words on word boundaries, so those two
# entries could not clean up stray line breaks or empty tokens and are left
# out (line breaks are already handled by stripWhitespace above).
stopwords <- c(
  stopwordsCN(c("問題", "請問", "你好", "您好", "妳好")),
  stopwords("english")
)
corpus <- tm_map(corpus, removeWords, stopwords)

# Preview the third document; skipped when the corpus has fewer than three.
if (length(corpus) >= 3L) {
  strwrap(as.character(corpus[[3]]), width = 100)
}

# Only terms of at least two characters are counted.
tdm <- TermDocumentMatrix(corpus, control = list(wordLengths = c(2, Inf)))

# Total frequency of every term over all documents, most frequent first.
m1 <- as.matrix(tdm)
v <- sort(rowSums(m1), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
wordcloud(d$word, d$freq, min.freq = 2, random.order = FALSE,
          ordered.colors = FALSE, colors = rainbow(nrow(m1)))
head(v, 10)
# Output recorded by the original author from an earlier version of this
# script (note the tokens polluted with "\n"):
#     嗎\n     現貨       謝謝      購買    嗎\n謝謝    顆\n     全新     老闆     硬碟     還\n
#      15       10        8        8          4        4        3        3        3        3