Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# coding: utf-8
# Rank a handful of short "documents" against a query by summed
# word-embedding cosine similarity: load pretrained word2vec vectors with
# gensim, normalize them in TensorFlow 1.x, dot query words against
# document words, and sum the similarities per document.
#
# NOTE(review): uses the pre-4.0 gensim KeyedVectors API
# (model.vocab / model.index2word); gensim >= 4.0 renamed these to
# key_to_index / index_to_key — confirm the pinned gensim version.

# In[1]:
import tensorflow as tf
import gensim
import numpy as np

# In[2]:
# read word2vec vectors pretrained on Google News, via gensim
filename_word2vec = "../../../data/GoogleNews-vectors-negative300_unigrams_alphabetic.bin"
model = gensim.models.KeyedVectors.load_word2vec_format(filename_word2vec, binary=True)

# In[3]:
# copy the gensim vectors into a dense numpy matrix; row i holds the
# 300-dim vector of vocabulary word i
vector_dim = 300
embedding_matrix = np.zeros((len(model.vocab), vector_dim))
for i in range(len(model.vocab)):
    # NOTE(review): model[word] cannot return None for an in-vocabulary
    # word, so this guard is redundant — kept for behavioral parity.
    embedding_vector = model[model.index2word[i]]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(embedding_matrix.shape)

# In[4]:
# wrap the numpy matrix in a frozen (non-trainable) TF variable
saved_embeddings = tf.constant(embedding_matrix)
embedding = tf.Variable(initial_value=saved_embeddings, trainable=False)

# In[5]:
# L2-normalize every row so that a plain dot product equals cosine
# similarity; norm has shape (vocab_size, 1) and broadcasts over columns
norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
normalized_embeddings = embedding / norm

# In[6]:
# map the query words to rows of the normalized embedding matrix
query = ["woman", "malignant", "cancer", "gastric"]
query_words_ids = [model.vocab[x].index for x in query]
query_words_tf = tf.constant(query_words_ids, dtype=tf.int32)
query_words_embeddings = tf.nn.embedding_lookup(normalized_embeddings, query_words_tf)
print(query_words_embeddings.shape)

# In[7]:
# map each document's words to embeddings, then flatten
# (n_docs, words_per_doc, dim) -> (n_docs * words_per_doc, dim)
docs = [["why", "an", "the"], ["ground", "blue", "tree"], ["female", "stomach", "disease"], ["male", "intestine", "pain"], ["main", "system", "world"]]
doc_words_ids = np.array([[model.vocab[x].index for x in doc] for doc in docs])
doc_words_tf = tf.constant(doc_words_ids, dtype=tf.int32)
doc_words_embeddings = tf.nn.embedding_lookup(normalized_embeddings, doc_words_tf)
old_shape = doc_words_embeddings.shape
print("old_shape:", old_shape)
new_shape = [np.prod(doc_words_embeddings.shape[:2]), doc_words_embeddings.shape[2]]
doc_words_embeddings = tf.reshape(doc_words_embeddings, new_shape)
print("new_shape:", doc_words_embeddings.shape)

# In[8]:
# cosine similarity of every query word against every document word;
# similarity: (len(query), n_docs * words_per_doc) = (4, 15) here
# (the original comment claimed 10 x 869549, which does not match the code)
similarity = tf.matmul(query_words_embeddings, doc_words_embeddings, transpose_b=True)
print(similarity.shape)

# In[9]:
# initialize tensorflow variables
init = tf.global_variables_initializer()

# In[10]:
# evaluate: reshape back to (query_word, doc, doc_word) and score each
# document by summing over all query words (axis 0) and all of the
# document's words (axis 2), leaving one scalar per document
with tf.Session() as sess:
    sess.run(init)
    similarity = tf.reshape(similarity, [len(query), len(docs), len(docs[0])])
    similarity = tf.reduce_sum(similarity, axis=(0, 2))
    sim = similarity.eval()
    print(sim)
    print(sim.shape)

# In[11]:
# rank documents for the given query, highest score first
ranks = (-sim).argsort()
print("ranks:", ranks)
print("query:", ' '.join(query))
print("-" * 10)
print("score\tdoc")
for r in ranks:
    print(str(sim[r]) + '\t' + ' '.join(docs[r]))
Add Comment
Please, Sign In to add comment