Guest User

Untitled

a guest
Jul 21st, 2018
103
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.72 KB | None | 0 0
  1. # coding: utf-8
  2.  
  3. # In[1]:
  4.  
  5.  
  6. import tensorflow as tf
  7. import gensim
  8. import numpy as np
  9.  
  10.  
  11. # In[2]:
  12.  
  13.  
  14. #read word2vec pretrained by using gensim
  15. filename_word2vec = "../../../data/GoogleNews-vectors-negative300_unigrams_alphabetic.bin"
  16. model = gensim.models.KeyedVectors.load_word2vec_format(filename_word2vec, binary=True)
  17.  
  18.  
  19. # In[3]:
  20.  
  21.  
  22. # gensim to embedding matrix
  23. vector_dim=300
  24. embedding_matrix = np.zeros((len(model.vocab), vector_dim))
  25. for i in range(len(model.vocab)):
  26. embedding_vector = model[model.index2word[i]]
  27. if embedding_vector is not None:
  28. embedding_matrix[i] = embedding_vector
  29. print embedding_matrix.shape
  30.  
  31.  
  32. # In[4]:
  33.  
  34.  
  35. # feed numpy embedding matrix
  36. saved_embeddings = tf.constant(embedding_matrix)
  37. embedding = tf.Variable(initial_value=saved_embeddings, trainable=False)
  38.  
  39.  
  40. # In[5]:
  41.  
  42.  
  43. # normalize embeddings
  44. norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
  45. # norm: 869549 x 1
  46. normalized_embeddings = embedding / norm
  47.  
  48.  
  49. # In[6]:
  50.  
  51.  
  52. # query words to embedding tensors
  53. query = ["woman", "malignant", "cancer", "gastric"]
  54. query_words_ids = [model.vocab[x].index for x in query]
  55. query_words_tf = tf.constant(query_words_ids, dtype=tf.int32)
  56. query_words_embeddings = tf.nn.embedding_lookup(normalized_embeddings, query_words_tf)
  57. print query_words_embeddings.shape
  58.  
  59.  
  60. # In[7]:
  61.  
  62.  
  63. # document words to embedding tensors
  64. docs = [["why", "an", "the"], ["ground", "blue", "tree"], ["female", "stomach", "disease"], ["male", "intestine", "pain"], ["main", "system", "world"]]
  65. doc_words_ids = np.array([[model.vocab[x].index for x in doc] for doc in docs])
  66. doc_words_tf = tf.constant(doc_words_ids, dtype=tf.int32)
  67. doc_words_embeddings = tf.nn.embedding_lookup(normalized_embeddings, doc_words_tf)
  68. old_shape = doc_words_embeddings.shape
  69. print "old_shape:", old_shape
  70. new_shape = [np.prod(doc_words_embeddings.shape[:2]), doc_words_embeddings.shape[2]]
  71. doc_words_embeddings = tf.reshape(doc_words_embeddings, new_shape)
  72. print "new_shape:", doc_words_embeddings.shape
  73.  
  74.  
  75. # In[8]:
  76.  
  77.  
  78. # get similar words
  79. similarity = tf.matmul(query_words_embeddings, doc_words_embeddings, transpose_b=True)
  80. # similarity: 10 x 869549
  81. print similarity.shape
  82.  
  83.  
  84. # In[9]:
  85.  
  86.  
  87. # initialize tensorflow variables
  88. init = tf.global_variables_initializer()
  89.  
  90.  
  91. # In[10]:
  92.  
  93.  
  94. # run to get similarity of words with ones in query
  95. with tf.Session() as sess:
  96. sess.run(init)
  97. similarity = tf.reshape(similarity, [len(query), len(docs), len(docs[0])])
  98. similarity = tf.reduce_sum(similarity, axis=(0, 2))
  99. sim = similarity.eval()
  100. print sim
  101. print sim.shape
  102.  
  103.  
  104. # In[11]:
  105.  
  106.  
  107. # rank documents fior the given documents:
  108. ranks = (-sim).argsort()
  109. print "ranks:", ranks
  110. print "query:", ' '.join(query)
  111. print "-"*10
  112. print "socre\tdoc"
  113. for r in ranks:
  114. print str(sim[r]) + '\t' + ' '.join(docs[r])
Add Comment
Please, Sign In to add comment