Advertisement
Guest User

Untitled

a guest
Jan 14th, 2019
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 1.14 KB | None | 0 0
  # Train GloVe word embeddings on the text8 corpus with text2vec, then list
  # the ten terms whose vectors are most cosine-similar to the term "data".
  library(text2vec)
  library(Rtsne)

  # Download and unpack the corpus only when it is not already on disk.
  text8_file <- "~/text8"
  if (!file.exists(text8_file)) {
    # The archive is binary, so request a binary transfer explicitly instead
    # of relying on download.file()'s platform- and extension-based default.
    download.file("http://mattmahoney.net/dc/text8.zip", "~/text8.zip",
                  mode = "wb")
    unzip("~/text8.zip", files = "text8", exdir = "~/")
  }
  # Only the first line of the file is read (n = 1).
  wiki <- readLines(text8_file, n = 1, warn = FALSE)

  # Split the text on whitespace and wrap the tokens in an iterator.
  tokens <- space_tokenizer(wiki)
  it <- itoken(tokens, progressbar = FALSE)
  # Create vocabulary. Terms will be unigrams (simple words).
  vocab <- create_vocabulary(it)
  # Keep only terms with at least 5 appearances.
  vocab <- prune_vocabulary(vocab, term_count_min = 5L)
  # Use the filtered vocabulary to build the term co-occurrence matrix,
  # with a window of 5 for context words.
  vectorizer <- vocab_vectorizer(vocab)
  tcm <- create_tcm(it, vectorizer, skip_grams_window = 5L)

  # NOTE(review): `word_vectors_size` / `vocabulary` are kept exactly as in the
  # original call; the constructor's argument names may differ between
  # text2vec releases -- confirm against the installed version.
  glove <- GlobalVectors$new(
    word_vectors_size = 50,
    vocabulary = vocab,
    x_max = 10
  )
  wv_main <- glove$fit_transform(tcm, n_iter = 10, convergence_tol = 0.01)
  dim(wv_main)
  wv_context <- glove$components
  dim(wv_context)
  # Word vectors add together: sum the main and (transposed) context vectors.
  word_vectors <- wv_main + t(wv_context)

  # Fail with a clear message if the query term did not survive pruning,
  # rather than with an opaque "subscript out of bounds" error.
  query_term <- "data"
  if (!query_term %in% rownames(word_vectors)) {
    stop("term '", query_term, "' is not in the pruned vocabulary",
         call. = FALSE)
  }
  cos_sim <- sim2(
    x = word_vectors,
    y = word_vectors[query_term, , drop = FALSE],  # keep a 1-row matrix
    method = "cosine",
    norm = "l2"
  )
  # Ten highest similarities to the query term.
  head(sort(cos_sim[, 1], decreasing = TRUE), 10)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement