SHARE
TWEET

Untitled

a guest Dec 10th, 2019 70 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import gensim
  2.  
  3. from gensim.models import KeyedVectors
  4. # load the google word2vec model
  5. filename = 'C:\\Users\\Миша\\Downloads\\GoogleNews-vectors-negative300.bin.gz'
  6. model = KeyedVectors.load_word2vec_format(filename, binary=True)
  7. # calculate: (king - man) + woman = ?
  8. result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
  9. print(result)
  10.  
  11. import numpy as np
  12.  
  13. word2vecdict = {}
  14. for word in model.vocab:
  15.     word2vecdict[word] = np.array(model[word])
  16. print(word2vecdict["king"])
  17.  
  18. with open("word2vecdict.pkl", "wb") as out_file:
  19.     pickle.dump(word2vecdict, out_file, protocol=pickle.HIGHEST_PROTOCOL)
  20.  
  21. import pickle
  22.  
  23. word2phonesvec = {}
  24. with open("C:\\Users\\Миша\\Desktop\\word2phones.pkl" , "rb") as file:
  25.     data = pickle.load(file)
  26. for word in data.keys():
  27.     if word in word2vecdict.keys():
  28.         word2phonesvec[word] = (data[word], word2vecdict[word])
  29. print(word2phonesvec["is"])
  30.  
  31. adj_vec = {}
  32. temp_word = []
  33. temp_vec = []
  34. for adj in adj_dic.keys():
  35.     temp_word = adj_dic[adj]
  36.     for word in temp_word:
  37.         try:
  38.             temp_vec.append(word2vecdict[word])
  39.         except Exception:
  40.             pass
  41.     adj_vec[adj] = np.sum(temp_vec, axis = 0)
  42. adj_vec
  43.  
  44. opposite_dict = {}
  45. opposite_dict["good_bad"] = adj_vec["Good"] - adj_vec["Bad"]
  46. opposite_dict["angular_round"] = adj_vec["Angular"] - adj_vec["Round"]
  47. opposite_dict["cold_hot"] = adj_vec["Cold"] - adj_vec["Hot"]
  48. opposite_dict["complex_simple"] = adj_vec["Complex"] - adj_vec["Simple"]
  49. opposite_dict["cowardly_brave"] = adj_vec["Cowardly"] - adj_vec["Brave"]
  50. opposite_dict["dangerous_secure"] = adj_vec["Dangerous"] - adj_vec["Secure"]
  51. opposite_dict["luminous_dark"] = adj_vec["Luminous"] - adj_vec["Dark"]
  52. opposite_dict["difficult_easy"] = adj_vec["Difficult"] - adj_vec["Easy"]
  53. opposite_dict["benign_evil"] = adj_vec["Benign"] - adj_vec["Evil"]
  54. opposite_dict["bright_faded"] = adj_vec["Bright"] - adj_vec["Faded"]
  55. opposite_dict["strong_feeble"] = adj_vec["Strong"] - adj_vec["Feeble"]
  56. opposite_dict["masculine_feminine"] = adj_vec["Masculine"] - adj_vec["Feminine"]
  57. opposite_dict["passive_active"] = adj_vec["Passive"] - adj_vec["Active"]
  58. opposite_dict["quiet_loud"] = adj_vec["Quiet"] - adj_vec["Loud"]
  59. opposite_dict["rough_tender"] = adj_vec["Rough"] - adj_vec["Tender"]
  60. opposite_dict["joyful_sad"] = adj_vec["Joyful"] - adj_vec["Sad"]
  61. opposite_dict["short_long"] = adj_vec["Short"] - adj_vec["Long"]
  62. opposite_dict["quick_slow"] = adj_vec["Quick"] - adj_vec["Slow"]
  63. opposite_dict["small_big"] = adj_vec["Small"] - adj_vec["Big"]
  64. opposite_dict["rough_or_not_smooth_smooth"] = adj_vec["Rough_or_not_smooth"] - adj_vec["Smooth"]
  65. opposite_dict["beautiful_ugly"] = adj_vec["Beautiful"] - adj_vec["Ugly"]
  66. opposite_dict
  67.  
  68. with open("sounds_in_words_dict.pkl" , "rb") as file:
  69.     data = pickle.load(file)
  70. data
  71.  
  72. def GetCos(v1 , v2):
  73.     m = np.dot(v1, v2)
  74.     return m
  75.  
  76. projection_dict = {}
  77. for ph in data.keys():
  78.     projection_dict[ph] = {}
  79.     for opp in opposite_dict.keys():
  80.         axes = opposite_dict[opp]
  81.         axes = axes / np.linalg.norm(axes)
  82.         axes_distributions = []
  83.         for arr in data[ph]:
  84.             distribution_list = []
  85.             for word in arr:
  86.                 try:
  87.                     word_vector = np.array(list(word2vecdict[word]))
  88.                     word_vector = word_vector / np.linalg.norm(word_vector)
  89.                     distribution_list.append(GetCos(word_vector, axes))
  90.                 except:
  91.                     pass
  92.             axes_distributions.append(distribution_list)
  93.            # print('{} - {} Arrays len: {}, {}, {}'.format(ph, opp, len(axes_distributions[0]), len(axes_distributions[1]), len(axes_distributions[2])))
  94.         projection_dict[ph][opp] = axes_distributions
  95. projection_dict
  96.  
  97. import scipy.stats as stats
  98. with open("projection_dict.pkl" , "rb") as file:
  99.     projection_dict = pickle.load(file)
  100.  
  101.    
  102. dist_dict ={}
  103.  
  104. def GetDist(elem):
  105.     if len(elem[0]) > 20 and len(elem[1]) > 20:
  106.         f_w = stats.mannwhitneyu(elem[0], elem[1])[1]
  107.     else :
  108.         f_w = None
  109.     if len(elem[1]) > 20 and len(elem[2]) > 20:
  110.         f_wo = stats.mannwhitneyu(elem[1], elem[2])[1]
  111.     else :
  112.         f_wo = None
  113.     if len(elem[0]) > 20 and len(elem[2]) > 20:
  114.         w_wo = stats.mannwhitneyu(elem[0], elem[2])[1]
  115.     else :
  116.         w_wo = None
  117.     return (("f_w" , f_w),("w_wo" , w_wo),("f_wo" , f_wo))
  118. for ph in projection_dict.keys():
  119.     dist_dict[ph] = {}
  120.     for opp in projection_dict[ph]:
  121.         dist_dict[ph][opp] = GetDist(projection_dict[ph][opp])
  122. dist_dict
  123.  
  124.  
  125. def GetNewTuple(elem):
  126.     res = []
  127.     if elem[0] != None and elem[0] < 0.001:
  128.         res.append(["first_with", elem[0]])
  129.     if elem[1] != None and elem[1] < 0.001:
  130.         res.append(["with_without",elem[1]])
  131.     if elem[2] != None and elem[2] < 0.001:
  132.         res.append(["first_without",elem[2]])  
  133.     return res
  134.  
  135. with open("dist_dict.pkl", "rb") as file:
  136.     data = pickle.load(file)
  137. result_thousand = {}
  138. for ph in data.keys():
  139.     result_thousand[ph] = {}
  140.     for opp in data[ph]:
  141.         if len(GetNewTuple(data[ph][opp]))!= 0 :
  142.             result_thousand[ph][opp] = GetNewTuple(data[ph][opp])
  143. result_thousand
  144. def GetString(elem):
  145.     elemstr = []
  146.     for e in elem:
  147.         elemstr.append(str(e))
  148.     res = " ".join(elemstr)
  149.     return res
  150.  
  151. for ph in result_thousand.keys():
  152.     for opp in result_thousand[ph].keys():
  153.         temp = GetString(result_thousand[ph][opp])
  154.         with open("thousands.txt" , "w") as file:
  155.             try:
  156.                 file.write(ph + " " + opp + " "+ temp)
  157.             except:
  158.                 pass
  159.             file.write("/n")
  160. file.close()
  161.  
  162. import shlex
  163. with open('thousands.txt', 'r') as f:
  164.  #   f_str = shlex.quote(f.read())
  165.     f_str = f.read().replace("," , " ")
  166.     f_str = "".join(c for c in f_str if c not in "\'")
  167.     f_str = "".join(c for c in f_str if c not in "\(")
  168.     f_str = "".join(c for c in f_str if c not in "\)")
  169.     f_str = "".join(c for c in f_str if c not in "\[")
  170.     f_str = "".join(c for c in f_str if c not in "\]")
  171.     f_str = "".join(c for c in f_str if c not in "\{")
  172.     f_str = "".join(c for c in f_str if c not in "\}")
  173.     file = open("thousands1.txt" , "w")
  174.     file.write(f_str)
  175.     file.close()
  176. f_str
  177.  
  178. def GetNewTuple(elem):
  179.     res = []
  180.     if elem[0] != None and elem[0] < 0.01:
  181.         res.append(["first_with", elem[0]])
  182.     if elem[1] != None and elem[1] < 0.01:
  183.         res.append(["with_without",elem[1]])
  184.     if elem[2] != None and elem[2] < 0.01:
  185.         res.append(["first_without",elem[2]])  
  186.     return res
  187.  
  188. with open("dist_dict.pkl", "rb") as file:
  189.     data = pickle.load(file)
  190. result_hundred = {}
  191. for ph in data.keys():
  192.     result_hundred[ph] = {}
  193.     for opp in data[ph]:
  194.         if len(GetNewTuple(data[ph][opp]))!= 0 :
  195.             result_hundred[ph][opp] = GetNewTuple(data[ph][opp])
  196. result_hundred
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top