Advertisement
Guest User

Untitled

a guest
Oct 18th, 2019
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.83 KB | None | 0 0
  1. import numpy as np
  2. from gensim.models.keyedvectors import KeyedVectors
  3. from gensim.scripts.glove2word2vec import glove2word2vec
  4. import pandas as pd
  5. import seaborn as sns
  6. import matplotlib.pyplot as plt
  7. from sklearn.manifold import TSNE
  8. sns.set()
  9.  
  10.  
  11. def get_words(filename="glove/acm_class.txt"):
  12. words = set()
  13. for line in open(filename):
  14. tokens = line.lower().split()
  15. words.update(tokens)
  16. return words
  17. def draw(model = None): # KeyedVectors.load_word2vec_format("glove.6B.50d.1.txt", binary=False)
  18. "Creates and TSNE model and plots it"
  19. labels = []
  20. tokens = []
  21.  
  22. selcted_words = get_words("animals.txt")
  23. selcted_words.update(get_words("clean_color.txt"))
  24.  
  25. for word in model.wv.vocab:
  26. if word in selcted_words:
  27. tokens.append(model[word])
  28. labels.append(word)
  29.  
  30. tsne_model = TSNE(perplexity=50, n_components=2, init='pca', n_iter=2500, random_state=23)
  31. new_values = tsne_model.fit_transform(tokens)
  32.  
  33. x = []
  34. y = []
  35. for value in new_values:
  36. x.append(value[0])
  37. y.append(value[1])
  38. for image_size in [12]:
  39. for font_size in [8]:
  40. from adjustText import adjust_text
  41.  
  42. fig = plt.figure(figsize=(image_size, image_size))
  43. plt.scatter(x, y, s=15, c='b', edgecolors=(1, 1, 1, 0))
  44.  
  45. texts = []
  46. for _x, _y, s in zip(x, y, labels):
  47. texts.append(plt.text(_x, _y, s, size=font_size))
  48.  
  49. adjust_text(texts)
  50. fig.savefig('fig1_' + str(image_size) + '_font_size_' + str(font_size) + ".eps", format='eps', dpi=1000)
  51.  
  52.  
  53. w2v = "GoogleNews-vectors-negative300.bin"
  54. w2v_vectors = KeyedVectors.load_word2vec_format(w2v, binary=True)
  55. embeddings = [w2v_vectors]
  56.  
  57. fasttext = "cc.en.300.vec"
  58. # glove2word2vec(fasttext, "tmp_file")
  59. fasttext_vectors = KeyedVectors.load_word2vec_format(fasttext, binary=False)
  60. embeddings.append(fasttext_vectors)
  61.  
  62. for glove_file in ["glove.6B.300d.txt","glove.6B.200d.txt","glove.6B.100d.txt","glove.6B.50d.txt"]:
  63. glove2word2vec(glove_file, "tmp_file")
  64. glove = KeyedVectors.load_word2vec_format("tmp_file", binary=False)
  65. embeddings.append(glove)
  66.  
  67. from functools import reduce
  68. interaction = lambda x,y:x & y
  69. interactions = reduce(interaction, [set(embedding.wv.vocab) for embedding in embeddings])
  70. for embedding in embeddings:
  71. interactions = interactions & set(embedding.wv.vocab)
  72.  
  73. lens = len( interactions)
  74.  
  75. def get_norm(a):
  76. return np.sqrt(np.sum(np.square(a)))
  77.  
  78. df_lentgh = []
  79. df_norms= []
  80. for embedding in embeddings:
  81. norms = dict()
  82. length = dict()
  83. for word in interactions:
  84. norms[word] = get_norm(embedding.wv[word])
  85. length[word] = np.mean(embedding.wv[word])
  86. df_norms.append(norms)
  87. df_lentgh.append(length)
  88. x= pd.DataFrame(df_norms).transpose()
  89. y= pd.DataFrame(df_lentgh).transpose()
  90.  
  91. x.columns = ["word2vec","fasttext","glove-300d","glove-200d","glove-100d","glove-50d"]
  92. y.columns = ["word2vec","fasttext","glove-300d","glove-200d","glove-100d","glove-50d"]
  93.  
  94.  
  95. print("Methods &"+" & ".join(x.columns)+ "\\\\")
  96. print("Mean for the vector norm &"+" & ".join([ "%.4f"% x[name].mean() for name in x.columns])+ "\\\\")
  97. print("var for the vector norm &"+" & ".join(["%.4f"% x[name].var() for name in x.columns])+ "\\\\")
  98. print("Mean for the vector mean value &"+" & ".join(["%.4f"% y[name].mean() for name in y.columns])+ "\\\\")
  99. print("Mean for the vector mean value &"+" & ".join(["%.4f"% y[name].var() for name in y.columns]) + "\\\\")
  100. for name in x.columns:
  101. print("%.4f & %.4f",x[name].mean())
  102. print(x[name].var())
  103.  
  104. sns.boxplot(data=x)
  105. sns.boxplot(data=y)
  106.  
  107. from draw import draw
  108. for model in embeddings:
  109. draw(model)
  110.  
  111. for name in x.columns :
  112. sns.kdeplot(x[name],shade=True)
  113.  
  114.  
  115. for name in y.columns :
  116. sns.kdeplot(y[name],shade=False)
  117.  
  118.  
  119.  
  120. words = get_words("animals.txt")
  121.  
  122. animals = (words & interactions)
  123. df_norms= []
  124. df_lentgh=[]
  125. for embedding in embeddings:
  126. norms = dict()
  127. length = dict()
  128. for word in animals:
  129. norms[word] = get_norm(embedding.wv[word])
  130. length[word] = np.mean(embedding.wv[word])
  131. df_norms.append(norms)
  132. df_lentgh.append(length)
  133. x= pd.DataFrame(df_norms).transpose()
  134. y= pd.DataFrame(df_lentgh).transpose()
  135.  
  136. sns.pairplot(x)
  137. sns.pairplot(y)
  138.  
  139. x.columns = ["word2vec","fasttext","glove-300d","glove-200d","glove-100d","glove-50d"]
  140. y.columns = ["word2vec","fasttext","glove-300d","glove-200d","glove-100d","glove-50d"]
  141. from scipy import stats
  142. for df in [x,y]:
  143. for func in [stats.pearsonr,stats.spearmanr,stats.chisquare]:
  144. for a in x.columns:
  145. cors, pvalues = [],[]
  146. for b in x.columns:
  147. c,p = func( df[a],df[b])
  148. cors.append(c)
  149. pvalues.append(p)
  150. numbers = " & ".join( ["%.4f (%.4f)"% (cors[i],pvalues[i]) for i in range(len(cors))] )
  151. print("%s & %s \\\\" %(a, numbers))
  152. print("\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement