Guest User

Untitled

a guest
Dec 10th, 2019
86
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import gensim
  2.  
  3. from gensim.models import KeyedVectors
  # Load the Google News word2vec vectors from the binary archive.
  filename = 'C:\\Users\\Миша\\Downloads\\GoogleNews-vectors-negative300.bin.gz'
  model = KeyedVectors.load_word2vec_format(filename, binary=True)

  # Sanity check with the classic analogy: (king - man) + woman = ?
  result = model.most_similar(
      negative=['man'],
      positive=['woman', 'king'],
      topn=1,
  )
  print(result)
  10.  
  11. import numpy as np
  12.  
  # Copy every embedding out of the gensim model into a plain dict
  # (word -> numpy array).
  # gensim >= 4.0 removed ``KeyedVectors.vocab`` in favour of
  # ``key_to_index``; older releases only have ``vocab``, so try the new
  # attribute first and fall back.
  vocabulary = getattr(model, "key_to_index", None)
  if vocabulary is None:
      vocabulary = model.vocab
  word2vecdict = {word: np.array(model[word]) for word in vocabulary}
  print(word2vecdict["king"])
  17.  
  # Persist the word -> vector dict to disk.
  # ``pickle`` is imported here because the file-level ``import pickle``
  # only appears further down, after this first use.
  import pickle

  with open("word2vecdict.pkl", "wb") as out_file:
      pickle.dump(word2vecdict, out_file, protocol=pickle.HIGHEST_PROTOCOL)
  20.  
  21. import pickle
  22.  
  # Pair each entry of word2phones.pkl with the word's embedding, keeping
  # only the words that are present in word2vecdict.
  word2phonesvec = {}
  with open("C:\\Users\\Миша\\Desktop\\word2phones.pkl" , "rb") as file:
      data = pickle.load(file)
  for word, phones in data.items():
      if word not in word2vecdict:
          continue
      word2phonesvec[word] = (phones, word2vecdict[word])
  print(word2phonesvec["is"])
  30.  
  # Build one vector per adjective key: the element-wise sum of the
  # embeddings of the words listed under that key in adj_dic.
  adj_vec = {}
  for adj, temp_word in adj_dic.items():
      # Start from an empty list for every key; a single list shared by all
      # keys would carry the previous keys' vectors into this sum.
      temp_vec = []
      for word in temp_word:
          try:
              temp_vec.append(word2vecdict[word])
          except KeyError:
              # The word has no embedding: leave it out of the sum.
              continue
      adj_vec[adj] = np.sum(temp_vec, axis=0)
  adj_vec
  43.  
  # Every axis is the difference between the vectors of two adjective keys.
  # Table rows: (axis key, minuend key in adj_vec, subtrahend key in adj_vec).
  opposite_dict = {}
  for axis_key, first_adj, second_adj in (
      ("good_bad", "Good", "Bad"),
      ("angular_round", "Angular", "Round"),
      ("cold_hot", "Cold", "Hot"),
      ("complex_simple", "Complex", "Simple"),
      ("cowardly_brave", "Cowardly", "Brave"),
      ("dangerous_secure", "Dangerous", "Secure"),
      ("luminous_dark", "Luminous", "Dark"),
      ("difficult_easy", "Difficult", "Easy"),
      ("benign_evil", "Benign", "Evil"),
      ("bright_faded", "Bright", "Faded"),
      ("strong_feeble", "Strong", "Feeble"),
      ("masculine_feminine", "Masculine", "Feminine"),
      ("passive_active", "Passive", "Active"),
      ("quiet_loud", "Quiet", "Loud"),
      ("rough_tender", "Rough", "Tender"),
      ("joyful_sad", "Joyful", "Sad"),
      ("short_long", "Short", "Long"),
      ("quick_slow", "Quick", "Slow"),
      ("small_big", "Small", "Big"),
      ("rough_or_not_smooth_smooth", "Rough_or_not_smooth", "Smooth"),
      ("beautiful_ugly", "Beautiful", "Ugly"),
  ):
      opposite_dict[axis_key] = adj_vec[first_adj] - adj_vec[second_adj]
  opposite_dict
  67.  
  # Load the pickled object stored in sounds_in_words_dict.pkl into ``data``.
  with open("sounds_in_words_dict.pkl" , "rb") as file:
      data = pickle.load(file)

  data
  71.  
  def GetCos(v1 , v2):
      """Return the dot product of ``v1`` and ``v2``.

      The value is a cosine similarity only when both inputs are already
      unit-length; no normalisation is performed here.
      """
      return np.dot(v1, v2)
  75.  
  # Project the words of every ``data`` entry onto every axis.
  # Layout of the result: projection_dict[ph][opp] is a list with one item
  # per word group in data[ph]; each item lists the dot products between
  # the unit-length word vectors and the unit-length axis.

  # The normalised axes do not depend on ``ph``, so compute them once
  # instead of once per (ph, opp) pair.
  unit_axes = {
      opp: axes / np.linalg.norm(axes) for opp, axes in opposite_dict.items()
  }

  projection_dict = {}
  for ph in data:
      projection_dict[ph] = {}
      for opp, axes in unit_axes.items():
          axes_distributions = []
          for arr in data[ph]:
              distribution_list = []
              for word in arr:
                  try:
                      word_vector = np.asarray(word2vecdict[word])
                  except KeyError:
                      # No embedding for this word: skip it (best effort).
                      continue
                  word_vector = word_vector / np.linalg.norm(word_vector)
                  distribution_list.append(GetCos(word_vector, axes))
              axes_distributions.append(distribution_list)
          projection_dict[ph][opp] = axes_distributions
  projection_dict
  96.  
  97. import scipy.stats as stats
  # Replace the in-memory projection_dict with the copy pickled on disk.
  with open("projection_dict.pkl" , "rb") as file:
      projection_dict = pickle.load(file)

  # Collects GetDist results, keyed first by ph and then by opp.
  dist_dict = {}
  103.  
  def GetDist(elem):
      """Run pairwise Mann-Whitney U tests on the three samples in ``elem``.

      A pair is tested only when both of its samples hold more than 20
      values; otherwise the p-value for that pair is ``None``.

      Returns a tuple of ``(label, p_value)`` pairs in this order:
      ``"f_w"`` (elem[0] vs elem[1]), ``"w_wo"`` (elem[0] vs elem[2]),
      ``"f_wo"`` (elem[1] vs elem[2]).
      """
      def p_value(left, right):
          # Item 1 of the mannwhitneyu result is the p-value.
          if len(left) > 20 and len(right) > 20:
              return stats.mannwhitneyu(left, right)[1]
          return None

      first, second, third = elem[0], elem[1], elem[2]
      return (
          ("f_w", p_value(first, second)),
          ("w_wo", p_value(first, third)),
          ("f_wo", p_value(second, third)),
      )
  # Summarise every projection_dict[ph][opp] entry with GetDist.
  for ph, per_axis in projection_dict.items():
      dist_dict[ph] = {
          opp: GetDist(samples) for opp, samples in per_axis.items()
      }
  dist_dict
  123.  
  124.  
  def GetNewTuple(elem, threshold=0.001):
      """Keep the entries of ``elem`` whose p-value is below ``threshold``.

      ``elem`` holds three entries, reported under the names
      ``"first_with"``, ``"with_without"`` and ``"first_without"`` (in that
      order).  Each entry may be a bare p-value, ``None``, or a
      ``(label, p_value)`` pair such as the ones ``GetDist`` returns.

      Returns a list of ``[name, p_value]`` lists for the entries that pass;
      entries whose p-value is ``None`` are skipped.
      """
      names = ("first_with", "with_without", "first_without")
      res = []
      for name, entry in zip(names, (elem[0], elem[1], elem[2])):
          # GetDist wraps every p-value as (label, p_value); comparing that
          # pair with a float raises TypeError, so unwrap it first.
          if isinstance(entry, (tuple, list)):
              entry = entry[1]
          if entry is not None and entry < threshold:
              res.append([name, entry])
      return res
  134.  
  # Reload dist_dict.pkl and keep, for every ph, only the opp entries for
  # which GetNewTuple returns a non-empty list.
  with open("dist_dict.pkl", "rb") as file:
      data = pickle.load(file)

  result_thousand = {}
  for ph, per_axis in data.items():
      kept = {}
      for opp, p_values in per_axis.items():
          significant = GetNewTuple(p_values)
          if significant:
              kept[opp] = significant
      result_thousand[ph] = kept
  result_thousand
  def GetString(elem):
      """Return the ``str()`` of every item of ``elem``, joined by single spaces."""
      return " ".join(map(str, elem))
  150.  
  # Write one "<ph> <opp> <results>" line per entry of result_thousand.
  # The file is opened once, outside the loops: re-opening it in "w" mode
  # for every entry truncates it each time and leaves only the last line.
  with open("thousands.txt" , "w") as file:
      for ph, per_axis in result_thousand.items():
          for opp, significant in per_axis.items():
              temp = GetString(significant)
              try:
                  # "\n" is the newline escape; "/n" writes a literal
                  # slash followed by the letter n.
                  file.write("{} {} {}\n".format(ph, opp, temp))
              except UnicodeEncodeError:
                  # Best effort: skip an entry the file's encoding cannot
                  # represent instead of aborting the whole export.
                  continue
  161.  
  162. import shlex
  # Strip list/tuple punctuation from thousands.txt and store the cleaned
  # text in thousands1.txt.
  with open('thousands.txt', 'r') as f:
      f_str = f.read()

  # One translate() pass: commas become spaces and the listed characters
  # are dropped.  The backslash stays in the drop set because the earlier
  # filters ("\(", "\)", ...) were two-character strings that removed it as
  # well; spelled that way they are invalid escape sequences, which newer
  # Python versions warn about.
  f_str = f_str.translate(str.maketrans(",", " ", "'\\()[]{}"))

  with open("thousands1.txt" , "w") as file:
      file.write(f_str)
  f_str
  177.  
  def GetNewTuple(elem, threshold=0.01):
      """Keep the entries of ``elem`` whose p-value is below ``threshold``.

      ``elem`` holds three entries, reported under the names
      ``"first_with"``, ``"with_without"`` and ``"first_without"`` (in that
      order).  Each entry may be a bare p-value, ``None``, or a
      ``(label, p_value)`` pair such as the ones ``GetDist`` returns.

      Returns a list of ``[name, p_value]`` lists for the entries that pass;
      entries whose p-value is ``None`` are skipped.
      """
      names = ("first_with", "with_without", "first_without")
      res = []
      for name, entry in zip(names, (elem[0], elem[1], elem[2])):
          # GetDist wraps every p-value as (label, p_value); comparing that
          # pair with a float raises TypeError, so unwrap it first.
          if isinstance(entry, (tuple, list)):
              entry = entry[1]
          if entry is not None and entry < threshold:
              res.append([name, entry])
      return res
  187.  
  # Reload dist_dict.pkl and keep, for every ph, only the opp entries for
  # which GetNewTuple returns a non-empty list.
  with open("dist_dict.pkl", "rb") as file:
      data = pickle.load(file)

  result_hundred = {}
  for ph, per_axis in data.items():
      kept = {}
      for opp, p_values in per_axis.items():
          significant = GetNewTuple(p_values)
          if significant:
              kept[opp] = significant
      result_hundred[ph] = kept
  result_hundred
RAW Paste Data