Untitled

# *** Text Pre-Processing with Quanteda ***
      # 1. Tokenization: split each document in docs$text into word tokens,
      #    dropping numbers, punctuation and symbols.
      #    `split_hyphens` is used instead of `remove_hyphens`: the latter was
      #    deprecated in quanteda 2.0 and newer releases report it as unused,
      #    which leaves hyphenated words unsplit. With remove_punct = TRUE the
      #    hyphen tokens produced by the split are dropped as punctuation.
      text.tokens <- tokens(docs$text, what = "word",
                            remove_numbers = TRUE,
                            remove_punct = TRUE,
                            remove_symbols = TRUE,
                            split_hyphens = TRUE)

      # 2. Transform words to lower case
      text.tokens <- tokens_tolower(text.tokens)

      # 3. Remove stop-words (quanteda's built-in list; stopwords() is called
      #    without arguments, i.e. with its default language)
      text.tokens <- tokens_select(text.tokens, stopwords(),
                                   selection = "remove")

      # 4. Stem the tokens with the English stemmer
      text.tokens <- tokens_wordstem(text.tokens, language = "english")

      # 5. Create the bag-of-words model (document-feature frequency matrix).
      #    Tokens were already lower-cased in step 2, hence tolower = FALSE.
      text.tokens.dfm <- dfm(text.tokens, tolower = FALSE)

      # 6. Convert to a plain matrix to work with, and inspect its dimensions
      text.tokens.matrix <- as.matrix(text.tokens.dfm)
      dim(text.tokens.matrix)

    # *** Doing TF-IDF ***
      # Relative term frequency (TF) of a single document.
      #
      # row: numeric vector of term counts for one document.
      # Returns a vector of the same length (and names) in which every count
      # is divided by the document's total count.
      #
      # A document whose counts sum to zero (e.g. every token was removed
      # during pre-processing) yields all zeros rather than the NaN values
      # that 0 / 0 would produce, so it cannot poison later TF-IDF results.
      term.frequency <- function(row) {
        total <- sum(row)

        # isTRUE() keeps the original NA propagation: if the row contains NA
        # the comparison is NA and we fall through to the plain division.
        if (isTRUE(total == 0)) {
          return(row * 0)
        }

        row / total
      }
      # Inverse document frequency (IDF) of a single term.
      #
      # col: vector of that term's counts, one entry per document.
      # Returns log10(number of documents / number of documents in which the
      # count is greater than zero).
      inverse.doc.freq <- function(col) {
        n.docs <- length(col)
        # Count the documents containing the term; NA entries are not counted.
        n.docs.with.term <- sum(col > 0, na.rm = TRUE)

        log10(n.docs / n.docs.with.term)
      }
      # TF-IDF weighting.
      #
      # tf:  term frequencies.
      # idf: inverse document frequencies, multiplied element-wise with tf.
      # Returns the element-wise product of the two.
      tf.idf <- function(tf, idf) {
        weighted <- tf * idf
        weighted
      }

      # 1. Relative term frequency of every document (row of the count
      #    matrix). apply() over rows stores each result as a column, so the
      #    outcome is transposed relative to text.tokens.matrix.
      text.tokens.df <- apply(X = text.tokens.matrix, MARGIN = 1,
                              FUN = term.frequency)
      dim(text.tokens.df)

      # 2. One IDF weight per term, computed from each column of the counts.
      text.tokens.idf <- apply(X = text.tokens.matrix, MARGIN = 2,
                               FUN = inverse.doc.freq)
      str(text.tokens.idf)

      # 3. TF-IDF: weight each document's TF vector by the IDF vector.
      #    Documents are the columns of text.tokens.df (see step 1), hence
      #    the iteration over MARGIN = 2.
      text.tokens.tfidf <- apply(X = text.tokens.df, MARGIN = 2, FUN = tf.idf,
                                 idf = text.tokens.idf)
      dim(text.tokens.tfidf)

      # 4. Transpose back so the orientation matches text.tokens.matrix again.
      text.tokens.tfidf <- t(text.tokens.tfidf)
      dim(text.tokens.tfidf)

     # Cosine similarity using text2vec.
     #
     # Uses text.tokens.tfidf, the TF-IDF matrix built above; the previously
     # referenced text.tokensChina.tfidf is not defined anywhere in this
     # script.
     #
     # norm = "l2" makes text2vec L2-normalise the rows before taking dot
     # products. norm = "none" is only correct for rows that are already
     # unit length, which the TF-IDF rows are not, so it would return raw
     # dot products instead of cosine similarities.

  # Pairwise similarity between all documents (documents x documents matrix).
  similarity.sim2 <- sim2(text.tokens.tfidf, text.tokens.tfidf,
                          method = "cosine", norm = "l2")

  # Row-wise ("parallel") similarity: row i of x against row i of y.
  # NOTE(review): both arguments are the same matrix, so this is each
  # document's similarity with itself; confirm whether a second matrix was
  # intended here.
  similarity.psim2 <- psim2(text.tokens.tfidf, text.tokens.tfidf,
                            method = "cosine", norm = "l2")
  similarity.psim2 <- as.data.frame(similarity.psim2)