Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- from gensim.models.keyedvectors import KeyedVectors
- from gensim.scripts.glove2word2vec import glove2word2vec
- import pandas as pd
- import seaborn as sns
- import matplotlib.pyplot as plt
- from sklearn.manifold import TSNE
- sns.set()
def get_words(filename="glove/acm_class.txt"):
    """Return the set of lowercase, whitespace-separated tokens in a text file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A set holding every distinct token of the file, lowercased. An empty
        file yields an empty set.
    """
    words = set()
    # The context manager closes the handle even when reading fails; the
    # previous version left the file open until garbage collection.
    with open(filename) as handle:
        for line in handle:
            words.update(line.lower().split())
    return words
def draw(model=None, *, prefix="fig1"):  # e.g. KeyedVectors.load_word2vec_format("glove.6B.50d.1.txt", binary=False)
    """Fit a 2-D t-SNE projection of selected word vectors and save it as a labelled scatter plot.

    The words plotted are those listed in ``animals.txt`` and
    ``clean_color.txt`` that also occur in the model's vocabulary.

    Args:
        model: Loaded embedding model exposing ``model.wv.vocab`` and
            ``model[word]``. Required, although it defaults to ``None``.
        prefix: Leading part of the output file name. The default keeps the
            historical ``fig1_<size>_font_size_<font>.eps`` name; pass a
            distinct value per model to avoid overwriting earlier figures.

    Raises:
        ValueError: If ``model`` is ``None``.
    """
    if model is None:
        # Fail with a clear message instead of an AttributeError on `None.wv`.
        raise ValueError("draw() requires a loaded embedding model, got None")

    # Imported once per call rather than on every loop iteration.
    from adjustText import adjust_text

    selected_words = get_words("animals.txt")
    selected_words.update(get_words("clean_color.txt"))

    labels = []
    tokens = []
    # NOTE(review): `.wv.vocab` is the gensim 3.x API; gensim 4 exposes
    # `key_to_index` instead -- confirm the installed version.
    for word in model.wv.vocab:
        if word in selected_words:
            tokens.append(model[word])
            labels.append(word)

    tsne_model = TSNE(perplexity=50, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)
    x = [value[0] for value in new_values]
    y = [value[1] for value in new_values]

    # Single-element lists kept so that several sizes can be rendered in one call.
    for image_size in [12]:
        for font_size in [8]:
            fig = plt.figure(figsize=(image_size, image_size))
            plt.scatter(x, y, s=15, c='b', edgecolors=(1, 1, 1, 0))
            texts = [
                plt.text(_x, _y, s, size=font_size)
                for _x, _y, s in zip(x, y, labels)
            ]
            # Shift the labels so that they do not overlap each other.
            adjust_text(texts)
            fig.savefig(
                prefix + '_' + str(image_size) + '_font_size_' + str(font_size) + ".eps",
                format='eps',
                dpi=1000,
            )
# Load every pre-trained embedding to compare. The append order below is the
# order in which the embeddings are later iterated.
w2v = "GoogleNews-vectors-negative300.bin"
w2v_vectors = KeyedVectors.load_word2vec_format(w2v, binary=True)
embeddings = [w2v_vectors]

# The fastText file is loaded directly as word2vec text format, without a
# glove2word2vec conversion step.
fasttext = "cc.en.300.vec"
fasttext_vectors = KeyedVectors.load_word2vec_format(fasttext, binary=False)
embeddings.append(fasttext_vectors)

# Each GloVe file is first converted to word2vec text format; the scratch file
# is overwritten on every iteration and loaded right away.
for glove_file in ["glove.6B.300d.txt", "glove.6B.200d.txt", "glove.6B.100d.txt", "glove.6B.50d.txt"]:
    glove2word2vec(glove_file, "tmp_file")
    glove = KeyedVectors.load_word2vec_format("tmp_file", binary=False)
    embeddings.append(glove)

from functools import reduce


def interaction(x, y):
    """Return the intersection of the sets ``x`` and ``y``."""
    return x & y


# Words present in the vocabulary of every embedding. A single reduce is
# enough: intersecting the result with each vocabulary again changes nothing.
# NOTE(review): `.wv.vocab` is the gensim 3.x API; gensim 4 exposes
# `key_to_index` instead -- confirm the installed version.
interactions = reduce(interaction, (set(embedding.wv.vocab) for embedding in embeddings))
lens = len(interactions)
def get_norm(a):
    """Return the square root of the sum of the squared elements of ``a``.

    For a real-valued vector this is its Euclidean (L2) length.
    """
    squared = np.square(a)
    return np.sqrt(squared.sum())
# For every embedding, map each shared word to the norm of its vector and to
# the mean of its vector components (one dict per embedding).
df_norms = [
    {word: get_norm(embedding.wv[word]) for word in interactions}
    for embedding in embeddings
]
df_lentgh = [
    {word: np.mean(embedding.wv[word]) for word in interactions}
    for embedding in embeddings
]

# After the transpose, rows are words and columns are embeddings.
x = pd.DataFrame(df_norms).transpose()
y = pd.DataFrame(df_lentgh).transpose()
method_names = ["word2vec", "fasttext", "glove-300d", "glove-200d", "glove-100d", "glove-50d"]
x.columns = list(method_names)
y.columns = list(method_names)

# Emit the summary as LaTeX table rows ("a & b & c \\").
print("Methods &" + " & ".join(x.columns) + "\\\\")
print("Mean for the vector norm &" + " & ".join(["%.4f" % x[name].mean() for name in x.columns]) + "\\\\")
print("var for the vector norm &" + " & ".join(["%.4f" % x[name].var() for name in x.columns]) + "\\\\")
print("Mean for the vector mean value &" + " & ".join(["%.4f" % y[name].mean() for name in y.columns]) + "\\\\")
# This row holds variances; it used to be printed under a second "Mean" label.
print("var for the vector mean value &" + " & ".join(["%.4f" % y[name].var() for name in y.columns]) + "\\\\")

# Mean and variance of the norm per embedding. The format string is now
# actually applied instead of being printed verbatim next to the value.
for name in x.columns:
    print("%.4f & %.4f" % (x[name].mean(), x[name].var()))

# NOTE(review): both calls draw on the current axes; outside a notebook with
# one plot per cell they would overlap -- confirm how this is run.
sns.boxplot(data=x)
sns.boxplot(data=y)
# NOTE(review): this binds `draw` from a module named `draw`, replacing any
# `draw` already defined in this file -- confirm which one is intended.
from draw import draw

# One t-SNE figure per loaded embedding.
for model in embeddings:
    draw(model)

# Kernel-density curve of every column: shaded for x, outline only for y.
for name in x:
    column = x[name]
    sns.kdeplot(column, shade=True)
for name in y:
    column = y[name]
    sns.kdeplot(column, shade=False)
from scipy import stats

# Restrict the comparison to the words of animals.txt that every embedding knows.
words = get_words("animals.txt")
animals = words & interactions

# One dict per embedding: word -> vector norm, and word -> mean vector component.
df_norms = [
    {word: get_norm(embedding.wv[word]) for word in animals}
    for embedding in embeddings
]
df_lentgh = [
    {word: np.mean(embedding.wv[word]) for word in animals}
    for embedding in embeddings
]

# After the transpose, rows are words and columns are embeddings.
x = pd.DataFrame(df_norms).transpose()
y = pd.DataFrame(df_lentgh).transpose()

# Name the columns before plotting so the pair plots are labelled with the
# method names instead of the positional indices 0..5.
method_names = ["word2vec", "fasttext", "glove-300d", "glove-200d", "glove-100d", "glove-50d"]
x.columns = list(method_names)
y.columns = list(method_names)
sns.pairplot(x)
sns.pairplot(y)

# For each table and each statistic, print one LaTeX row per method holding
# "statistic (p-value)" against every method.
# NOTE(review): stats.chisquare treats its arguments as observed / expected
# frequencies -- confirm that it is meaningful for these columns.
for df in [x, y]:
    for func in [stats.pearsonr, stats.spearmanr, stats.chisquare]:
        for a in df.columns:
            results = [func(df[a], df[b]) for b in df.columns]
            numbers = " & ".join(["%.4f (%.4f)" % (c, p) for c, p in results])
            print("%s & %s \\\\" % (a, numbers))
        print("\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement