Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
library(text2vec)
library(Rtsne)  # NOTE(review): loaded but unused here — presumably for a later t-SNE plot of the embeddings; confirm or drop

# Train GloVe word embeddings on the text8 corpus (first ~100 MB of cleaned
# English Wikipedia), then list the nearest neighbours of the word "data".

# Download and extract the corpus once.
text8_file <- "~/text8"
if (!file.exists(text8_file)) {
  download.file("http://mattmahoney.net/dc/text8.zip", "~/text8.zip")
  unzip("~/text8.zip", files = "text8", exdir = "~/")
}

# text8 is a single line of lowercase, space-separated tokens,
# so reading one line captures the whole corpus.
wiki <- readLines(text8_file, n = 1, warn = FALSE)

# Create iterator over tokens
tokens <- space_tokenizer(wiki)
it <- itoken(tokens, progressbar = FALSE)

# Vocabulary of unigrams (simple words), pruned to terms with
# at least 5 appearances to keep the co-occurrence matrix manageable.
vocab <- create_vocabulary(it)
vocab <- prune_vocabulary(vocab, term_count_min = 5L)

# Use our filtered vocabulary
vectorizer <- vocab_vectorizer(vocab)

# Term co-occurrence matrix with a symmetric window of 5 context words.
tcm <- create_tcm(it, vectorizer, skip_grams_window = 5L)

# Fit 50-dimensional GloVe vectors. text2vec >= 0.6 uses `rank`; the old
# `word_vectors_size` and `vocabulary` constructor arguments were removed.
glove <- GlobalVectors$new(rank = 50, x_max = 10)
wv_main <- glove$fit_transform(tcm, n_iter = 10, convergence_tol = 0.01)
dim(wv_main)

# Context vectors learned alongside the main vectors; per the GloVe paper,
# summing main + context vectors usually gives better embeddings than either alone.
wv_context <- glove$components
dim(wv_context)
word_vectors <- wv_main + t(wv_context)

# Ten most similar words to "data" by L2-normalized cosine similarity.
# drop = FALSE keeps the single row as a matrix, as sim2() expects.
cos_sim <- sim2(
  x = word_vectors,
  y = word_vectors["data", , drop = FALSE],
  method = "cosine",
  norm = "l2"
)
head(sort(cos_sim[, 1], decreasing = TRUE), 10)
Advertisement
Add Comment
Please sign in to add a comment
Advertisement