Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# coding: utf-8
# Rank a handful of short "documents" against a query by summed
# word-embedding cosine similarity: load pretrained word2vec vectors with
# gensim, normalize them in TensorFlow 1.x, dot query words against
# document words, and sum the similarities per document.
#
# NOTE(review): uses the pre-4.0 gensim KeyedVectors API
# (model.vocab / model.index2word); gensim >= 4.0 renamed these to
# key_to_index / index_to_key — confirm the pinned gensim version.

# In[1]:
import tensorflow as tf
import gensim
import numpy as np

# In[2]:
# read word2vec vectors pretrained on Google News, via gensim
filename_word2vec = "../../../data/GoogleNews-vectors-negative300_unigrams_alphabetic.bin"
model = gensim.models.KeyedVectors.load_word2vec_format(filename_word2vec, binary=True)

# In[3]:
# copy the gensim vectors into a dense numpy matrix; row i holds the
# 300-dim vector of vocabulary word i
vector_dim = 300
embedding_matrix = np.zeros((len(model.vocab), vector_dim))
for i in range(len(model.vocab)):
    # NOTE(review): model[word] cannot return None for an in-vocabulary
    # word, so this guard is redundant — kept for behavioral parity.
    embedding_vector = model[model.index2word[i]]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(embedding_matrix.shape)

# In[4]:
# wrap the numpy matrix in a frozen (non-trainable) TF variable
saved_embeddings = tf.constant(embedding_matrix)
embedding = tf.Variable(initial_value=saved_embeddings, trainable=False)

# In[5]:
# L2-normalize every row so that a plain dot product equals cosine
# similarity; norm has shape (vocab_size, 1) and broadcasts over columns
norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
normalized_embeddings = embedding / norm

# In[6]:
# map the query words to rows of the normalized embedding matrix
query = ["woman", "malignant", "cancer", "gastric"]
query_words_ids = [model.vocab[x].index for x in query]
query_words_tf = tf.constant(query_words_ids, dtype=tf.int32)
query_words_embeddings = tf.nn.embedding_lookup(normalized_embeddings, query_words_tf)
print(query_words_embeddings.shape)

# In[7]:
# map each document's words to embeddings, then flatten
# (n_docs, words_per_doc, dim) -> (n_docs * words_per_doc, dim)
docs = [["why", "an", "the"], ["ground", "blue", "tree"], ["female", "stomach", "disease"], ["male", "intestine", "pain"], ["main", "system", "world"]]
doc_words_ids = np.array([[model.vocab[x].index for x in doc] for doc in docs])
doc_words_tf = tf.constant(doc_words_ids, dtype=tf.int32)
doc_words_embeddings = tf.nn.embedding_lookup(normalized_embeddings, doc_words_tf)
old_shape = doc_words_embeddings.shape
print("old_shape:", old_shape)
new_shape = [np.prod(doc_words_embeddings.shape[:2]), doc_words_embeddings.shape[2]]
doc_words_embeddings = tf.reshape(doc_words_embeddings, new_shape)
print("new_shape:", doc_words_embeddings.shape)

# In[8]:
# cosine similarity of every query word against every document word;
# similarity: (len(query), n_docs * words_per_doc) = (4, 15) here
# (the original comment claimed 10 x 869549, which does not match the code)
similarity = tf.matmul(query_words_embeddings, doc_words_embeddings, transpose_b=True)
print(similarity.shape)

# In[9]:
# initialize tensorflow variables
init = tf.global_variables_initializer()

# In[10]:
# evaluate: reshape back to (query_word, doc, doc_word) and score each
# document by summing over all query words (axis 0) and all of the
# document's words (axis 2), leaving one scalar per document
with tf.Session() as sess:
    sess.run(init)
    similarity = tf.reshape(similarity, [len(query), len(docs), len(docs[0])])
    similarity = tf.reduce_sum(similarity, axis=(0, 2))
    sim = similarity.eval()
    print(sim)
    print(sim.shape)

# In[11]:
# rank documents for the given query, highest score first
ranks = (-sim).argsort()
print("ranks:", ranks)
print("query:", ' '.join(query))
print("-" * 10)
print("score\tdoc")
for r in ranks:
    print(str(sim[r]) + '\t' + ' '.join(docs[r]))
Add Comment
Please, Sign In to add comment