Advertisement
swchen

R_text_mining_chinese

Aug 5th, 2015
591
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 1.14 KB | None | 0 0
  # Chinese text-mining demo: read demo.csv, build a tm corpus, clean it,
  # segment the Chinese text into words, drop stop words, then build a
  # term-document matrix and draw a word cloud of the term frequencies.
  #
  # Packages the script relies on:
  #   tm        - Corpus, VectorSource, tm_map, the built-in transformations,
  #               stopwords, PlainTextDocument, TermDocumentMatrix
  #   wordcloud - wordcloud
  #   Rwordseg  - segmentCN
  #   tmcn      - stopwordsCN
  library(tm)
  library(wordcloud)
  library(Rwordseg)
  library(tmcn)

  demo <- read.csv("demo.csv")

  # NOTE(review): `demo` is the whole data frame returned by read.csv();
  # confirm that passing it to VectorSource() as-is (rather than a single
  # text column) is what is intended.
  corpus <- Corpus(VectorSource(demo))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)

  # Strip any remaining ASCII letters and digits so only non-ASCII text is
  # left for segmentation.
  # NOTE(review): this passes a bare function to tm_map(); presumably the
  # PlainTextDocument conversion further down compensates for that - confirm
  # against the installed tm version (see content_transformer()).
  corpus <- tm_map(corpus, function(word) gsub("[A-Za-z0-9]", "", word))
  corpus <- tm_map(corpus, segmentCN, nature = TRUE)

  # demo_source <- VectorSource(demo)

  # Stop-word list: the result of stopwordsCN() for the extra greeting /
  # question words, tm's English list, plus "\n" and "".
  # Named `stop_words` so it does not shadow tm's stopwords() function.
  # NOTE(review): the sample output at the bottom still contains terms ending
  # in "\n", so the "\n" and "" entries evidently do not remove newlines.
  stop_words <- c(
    stopwordsCN(c("問題", "請問", "你好", "您好", "妳好")),
    stopwords("english"),
    "\n",
    ""
  )
  corpus <- tm_map(corpus, removeWords, stop_words)

  # Inspect the third document, wrapped to 100 columns.
  strwrap(corpus[[3]], width = 100)

  # Convert the documents to PlainTextDocument form.
  corpus <- tm_map(corpus, PlainTextDocument)

  # Keep only terms that are at least 2 characters long.
  tdm <- TermDocumentMatrix(corpus, control = list(wordLengths = c(2, Inf)))

  # Total frequency of each term across all documents, most frequent first.
  m1 <- as.matrix(tdm)
  v <- sort(rowSums(m1), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)

  wordcloud(
    d$word, d$freq,
    min.freq = 2,
    random.order = FALSE,
    ordered.colors = FALSE,
    colors = rainbow(length(row.names(m1)))
  )

  head(v, 10)
  # The result is as follows:
  #     嗎\n     現貨       謝謝      購買    嗎\n謝謝    顆\n     全新     老闆     硬碟     還\n
  #      15       10        8        8          4        4        3        3        3        3
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement