# *** Text Pre-Processing with Quanteda ***
library(quanteda)

# 1. Tokenization
text.tokens <- tokens(docs$text, what = 'word',
                      remove_numbers = TRUE,
                      remove_punct = TRUE,
                      remove_symbols = TRUE,
                      remove_hyphens = TRUE)
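
# As a quick sanity check (assuming the corpus is non-empty), inspect
# the tokens of the first document:
text.tokens[[1]]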

# 2. Transform words to lower case
text.tokens <- tokens_tolower(text.tokens)

# 3. Remove stop-words (using quanteda's built-in stopwords list)
text.tokens <- tokens_select(text.tokens, stopwords(),
                             selection = 'remove')
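
# stopwords() defaults to the English list; its first entries look like:
head(stopwords('english'), 10)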

# 4. Perform stemming on the tokens.
text.tokens <- tokens_wordstem(text.tokens, language = 'english')

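# Stemming maps inflected forms onto a common root; for example, both
# of these reduce to "run":
char_wordstem(c('running', 'runs'), language = 'english')
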
# 5. Create the bag-of-words model (document-feature matrix)
text.tokens.dfm <- dfm(text.tokens, tolower = FALSE)

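# Inspect the most frequent features in the dfm:
topfeatures(text.tokens.dfm, 20)
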
# 6. Transform to a matrix to work with and inspect
text.tokens.matrix <- as.matrix(text.tokens.dfm)
dim(text.tokens.matrix)

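# Peek at a corner of the matrix (assuming the corpus has at least
# 5 documents and 10 features):
text.tokens.matrix[1:5, 1:10]
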
# *** Doing TF-IDF ***
# Define a function for calculating relative term frequency (TF)
term.frequency <- function(row) {
  row / sum(row)
}
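
# For instance, a document with raw counts c(2, 1, 1) has relative
# frequencies c(0.5, 0.25, 0.25):
term.frequency(c(2, 1, 1))
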
# Define a function for calculating inverse document frequency (IDF)
inverse.doc.freq <- function(col) {
  corpus.size <- length(col)
  doc.count <- length(which(col > 0))

  log10(corpus.size / doc.count)
}
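
# For instance, a term that appears in 2 of 4 documents gets
# IDF = log10(4 / 2), roughly 0.301:
inverse.doc.freq(c(1, 0, 2, 0))
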
# Define a function for calculating TF-IDF
tf.idf <- function(tf, idf) {
  tf * idf
}

# 1. First step, normalize all documents via TF.
text.tokens.df <- apply(text.tokens.matrix, 1, term.frequency)
dim(text.tokens.df)

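# Note: apply() over rows returns its results as columns, so
# text.tokens.df now has terms as rows and documents as columns.
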
# 2. Second step, calculate the IDF vector
text.tokens.idf <- apply(text.tokens.matrix, 2, inverse.doc.freq)
str(text.tokens.idf)

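# A term that appears in every document gets IDF = log10(1) = 0 and
# therefore contributes nothing to the TF-IDF weights.
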
# 3. Lastly, calculate TF-IDF for our corpus.
# Apply the function over columns, because the matrix is transposed
# after the TF step.
text.tokens.tfidf <- apply(text.tokens.df, 2, tf.idf, idf = text.tokens.idf)
dim(text.tokens.tfidf)

# Now, transpose the matrix back (documents as rows again)
text.tokens.tfidf <- t(text.tokens.tfidf)
dim(text.tokens.tfidf)

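# Documents that lost all of their tokens during cleaning have a TF
# denominator of zero and therefore produce NaN rows. A common fix
# (an addition here, not part of the original snippet) is to zero
# them out before computing similarities:
text.tokens.tfidf[!complete.cases(text.tokens.tfidf), ] <- 0
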
# Cosine similarity using text2vec
library(text2vec)

similarity.sim2 <- sim2(text.tokens.tfidf, text.tokens.tfidf,
                        method = "cosine", norm = "none")

similarity.psim2 <- psim2(text.tokens.tfidf, text.tokens.tfidf,
                          method = "cosine", norm = "none")
similarity.psim2 <- as.data.frame(similarity.psim2)
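
# For example, to see which documents are most similar to the first one
# (assuming the corpus holds at least five documents):
sort(similarity.sim2[1, ], decreasing = TRUE)[1:5]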