Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# *** Text pre-processing with quanteda ***
# Reads the character column `docs$text`; `docs` is not created in this
# section and must already exist.

# 1. Tokenise into words with the number, punctuation, symbol and hyphen
#    removal options switched on.
# NOTE(review): quanteda >= 2.0 renamed `remove_hyphens` to `split_hyphens`;
# confirm against the installed version before changing the argument.
text.tokens <- tokens(
  docs$text,
  what = "word",
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_hyphens = TRUE
)

# 2. Fold every token to lower case.
text.tokens <- tokens_tolower(text.tokens)

# 3. Drop stop-words using quanteda's built-in default list.
text.tokens <- tokens_select(
  text.tokens,
  stopwords(),
  selection = "remove"
)

# 4. Stem the remaining tokens.
text.tokens <- tokens_wordstem(text.tokens, language = "english")

# 5. Build the bag-of-words model (document-feature matrix of frequencies).
#    Tokens were already lower-cased in step 2, hence tolower = FALSE.
text.tokens.dfm <- dfm(text.tokens, tolower = FALSE)

# 6. Convert to a plain matrix (documents in rows, terms in columns) so it
#    can be inspected and passed to apply().
text.tokens.matrix <- as.matrix(text.tokens.dfm)
dim(text.tokens.matrix)
# *** Doing TF-IDF ***

# Relative term frequency (TF) of one document.
#
# row: numeric vector of term counts for a single document.
#
# Returns a vector of the same length (names preserved) in which each count
# is divided by the document's total count. A document whose counts sum to
# zero (e.g. every token was removed during pre-processing) yields all
# zeros instead of the NaN values that 0 / 0 would produce.
term.frequency <- function(row) {
  total <- sum(row)
  # Guard only the exact-zero case; an NA total still propagates as NA.
  if (!is.na(total) && total == 0) {
    return(row * 0)
  }
  row / total
}
# Inverse document frequency (IDF) of one term.
#
# col: numeric vector holding the term's count in every document of the
#      corpus (one element per document).
#
# Returns log10(number of documents / number of documents in which the
# count is greater than zero). NA counts are not counted as containing
# the term.
inverse.doc.freq <- function(col) {
  n.docs <- length(col)
  n.docs.with.term <- sum(col > 0, na.rm = TRUE)
  log10(n.docs / n.docs.with.term)
}
# TF-IDF weighting.
#
# tf:  numeric vector of relative term frequencies for one document.
# idf: numeric vector of inverse document frequencies, element-aligned
#      with `tf`.
#
# Returns the element-wise product of the two vectors.
tf.idf <- function(tf, idf) {
  weighted <- tf * idf
  weighted
}
# 1. Normalise every document to relative term frequencies (TF).
#    apply() over rows returns one column per document, so the result is
#    transposed relative to text.tokens.matrix (terms x documents).
text.tokens.df <- apply(text.tokens.matrix, 1, term.frequency)
dim(text.tokens.df)

# 2. Compute the IDF vector: one value per term (column of the count matrix).
text.tokens.idf <- apply(text.tokens.matrix, 2, inverse.doc.freq)
str(text.tokens.idf)

# 3. Compute TF-IDF for the corpus.
#    Applied over columns because the TF matrix is transposed (see step 1).
text.tokens.tfidf <- apply(text.tokens.df, 2, tf.idf, idf = text.tokens.idf)
dim(text.tokens.tfidf)

# Transpose back to documents x terms.
text.tokens.tfidf <- t(text.tokens.tfidf)
dim(text.tokens.tfidf)

# A document left with no tokens has a zero total count, which makes its TF
# 0 / 0 = NaN. Zero those entries so they cannot poison the similarities.
text.tokens.tfidf[is.nan(text.tokens.tfidf)] <- 0

# Cosine similarity using text2vec.
# The similarity is computed on the TF-IDF matrix built above (the original
# referred to `text.tokensChina.tfidf`, which this script never defines).
# norm = "l2" makes text2vec scale each row to unit length; with
# norm = "none" the rows are assumed to be normalised already and the
# result on raw TF-IDF weights is a plain dot product, not a cosine.
similarity.sim2 <- sim2(
  text.tokens.tfidf,
  text.tokens.tfidf,
  method = "cosine",
  norm = "l2"
)

# NOTE(review): psim2() compares row i of the first matrix with row i of the
# second. With the same matrix on both sides every document is compared with
# itself - confirm whether a different second matrix was intended.
similarity.psim2 <- psim2(
  text.tokens.tfidf,
  text.tokens.tfidf,
  method = "cosine",
  norm = "l2"
)
similarity.psim2 <- as.data.frame(similarity.psim2)
Add Comment
Please, Sign In to add comment