Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import gensim
- from gensim.models import KeyedVectors
# Load the pretrained Google News word2vec embeddings (binary word2vec format).
filename = 'C:\\Users\\Миша\\Downloads\\GoogleNews-vectors-negative300.bin.gz'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

# Sanity check with the classic analogy: (king - man) + woman = ?
result = model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)
print(result)
- import numpy as np
# pickle is needed for the dump below, but the file only imports it later on,
# so import it here to keep this step runnable top to bottom.
import pickle

# Gensim >= 4.0 exposes the vocabulary as `key_to_index`; older releases only
# have `vocab`. Either one iterates over the vocabulary words.
vocabulary = getattr(model, "key_to_index", None)
if vocabulary is None:
    vocabulary = model.vocab

# Copy every embedding into a plain {word: numpy vector} dictionary.
word2vecdict = {}
for word in vocabulary:
    word2vecdict[word] = np.array(model[word])
print(word2vecdict["king"])

# Persist the dictionary so later steps can reuse it without the model.
with open("word2vecdict.pkl", "wb") as out_file:
    pickle.dump(word2vecdict, out_file, protocol=pickle.HIGHEST_PROTOCOL)
- import pickle
# Pair each entry of word2phones.pkl with the word's embedding, keeping only
# the words that also appear in word2vecdict.
with open("C:\\Users\\Миша\\Desktop\\word2phones.pkl" , "rb") as file:
    data = pickle.load(file)

word2phonesvec = {
    word: (data[word], word2vecdict[word])
    for word in data
    if word in word2vecdict
}
print(word2phonesvec["is"])
# Build one vector per adjective category: the sum of the embeddings of all
# words listed for that category in adj_dic.
adj_vec = {}
temp_word = []
temp_vec = []
for adj in adj_dic.keys():
    temp_word = adj_dic[adj]
    # Start from an empty list for every category. Without this reset each
    # sum would also include the vectors of all previously processed
    # categories.
    temp_vec = []
    for word in temp_word:
        try:
            temp_vec.append(word2vecdict[word])
        except KeyError:
            # The word has no embedding; leave it out of the sum.
            pass
    adj_vec[adj] = np.sum(temp_vec, axis=0)
adj_vec
# Each axis is the difference between the vectors of two opposed adjective
# categories: (axis name, minuend category, subtrahend category).
opposite_pairs = (
    ("good_bad", "Good", "Bad"),
    ("angular_round", "Angular", "Round"),
    ("cold_hot", "Cold", "Hot"),
    ("complex_simple", "Complex", "Simple"),
    ("cowardly_brave", "Cowardly", "Brave"),
    ("dangerous_secure", "Dangerous", "Secure"),
    ("luminous_dark", "Luminous", "Dark"),
    ("difficult_easy", "Difficult", "Easy"),
    ("benign_evil", "Benign", "Evil"),
    ("bright_faded", "Bright", "Faded"),
    ("strong_feeble", "Strong", "Feeble"),
    ("masculine_feminine", "Masculine", "Feminine"),
    ("passive_active", "Passive", "Active"),
    ("quiet_loud", "Quiet", "Loud"),
    ("rough_tender", "Rough", "Tender"),
    ("joyful_sad", "Joyful", "Sad"),
    ("short_long", "Short", "Long"),
    ("quick_slow", "Quick", "Slow"),
    ("small_big", "Small", "Big"),
    ("rough_or_not_smooth_smooth", "Rough_or_not_smooth", "Smooth"),
    ("beautiful_ugly", "Beautiful", "Ugly"),
)

opposite_dict = {}
for axis_name, minuend, subtrahend in opposite_pairs:
    opposite_dict[axis_name] = adj_vec[minuend] - adj_vec[subtrahend]
opposite_dict
# Load the pickled dictionary that the projection step iterates over.
# NOTE(review): presumably keyed by sound/phoneme with word lists as values;
# confirm against the code that produced the pickle.
with open("sounds_in_words_dict.pkl" , "rb") as file:
    data = pickle.load(file)

data
def GetCos(v1 , v2):
    """Return the dot product of *v1* and *v2*.

    No normalisation is done here, so the result is the cosine of the angle
    between the vectors only when both are already unit length.
    """
    return np.dot(v1, v2)
# For every key of `data` and every opposite axis, collect the projections of
# the words' unit vectors onto the unit axis, one list per word group.

# The axes do not depend on the outer key, so normalise each of them once.
unit_axes = {}
for opp in opposite_dict.keys():
    unit_axes[opp] = opposite_dict[opp] / np.linalg.norm(opposite_dict[opp])

projection_dict = {}
for ph in data.keys():
    projection_dict[ph] = {}
    for opp, axes in unit_axes.items():
        axes_distributions = []
        for arr in data[ph]:
            distribution_list = []
            for word in arr:
                try:
                    word_vector = np.asarray(word2vecdict[word])
                except KeyError:
                    # No embedding for this word; skip it.
                    continue
                word_vector = word_vector / np.linalg.norm(word_vector)
                # Both vectors are unit length, so the dot product returned
                # by GetCos is their cosine similarity.
                distribution_list.append(GetCos(word_vector, axes))
            axes_distributions.append(distribution_list)
        projection_dict[ph][opp] = axes_distributions
projection_dict
- import scipy.stats as stats
# Replace projection_dict with the version stored on disk.
with open("projection_dict.pkl" , "rb") as file:
    projection_dict = pickle.load(file)

# Filled below with the significance results for every key/axis pair.
dist_dict = {}
def GetDist(elem):
    """Run pairwise Mann-Whitney U tests on the three samples in *elem*.

    A pair is tested only when both of its samples hold more than 20 values;
    otherwise its p-value is reported as None.

    Returns:
        ``(("f_w", p01), ("w_wo", p02), ("f_wo", p12))`` where ``pIJ`` is the
        p-value for ``elem[I]`` against ``elem[J]``.

    NOTE(review): if the labels abbreviate first/with/without, they match the
    indices only when elem is ordered (with, first, without) - confirm
    against the code that builds the samples.
    """
    def p_value(i, j):
        # Skip the test when either sample is too small.
        if len(elem[i]) > 20 and len(elem[j]) > 20:
            return stats.mannwhitneyu(elem[i], elem[j])[1]
        return None

    f_w = p_value(0, 1)
    f_wo = p_value(1, 2)
    w_wo = p_value(0, 2)
    return (("f_w" , f_w), ("w_wo" , w_wo), ("f_wo" , f_wo))
# Run the pairwise tests for every key/axis combination.
for ph, per_axis in projection_dict.items():
    dist_dict[ph] = {
        opp: GetDist(samples) for opp, samples in per_axis.items()
    }
dist_dict
def GetNewTuple(elem, threshold=0.001):
    """Collect the comparisons in *elem* whose p-value is below *threshold*.

    Args:
        elem: Three entries, reported in order as "first_with",
            "with_without" and "first_without". Each entry is either a bare
            p-value (or None) or a ``(label, p_value)`` pair such as the
            ones GetDist returns.
        threshold: Exclusive upper bound; only p-values strictly below it
            are kept.

    Returns:
        A list of ``[name, p_value]`` lists for the significant entries, in
        the order given above. Entries whose p-value is None are ignored.
    """
    names = ("first_with", "with_without", "first_without")
    res = []
    for name, entry in zip(names, elem):
        # Unwrap (label, p_value) pairs; comparing the pair itself with a
        # float would raise TypeError.
        p_value = entry[1] if isinstance(entry, (tuple, list)) else entry
        if p_value is not None and p_value < threshold:
            res.append([name, p_value])
    return res
# Reload the stored test results and keep, per key, only the axes that have
# at least one comparison accepted by GetNewTuple.
with open("dist_dict.pkl", "rb") as file:
    data = pickle.load(file)

result_thousand = {}
for ph in data:
    significant = {}
    for opp in data[ph]:
        hits = GetNewTuple(data[ph][opp])
        if hits:
            significant[opp] = hits
    result_thousand[ph] = significant
result_thousand
def GetString(elem):
    """Return the ``str()`` of every item in *elem*, joined by single spaces."""
    return " ".join(map(str, elem))
# Dump the significant results as text, one "<key> <axis> <hits>" record per
# line. The file is opened once for the whole dump: reopening it in "w" mode
# for every record truncates it each time and leaves only the last record.
with open("thousands.txt" , "w") as file:
    for ph in result_thousand.keys():
        for opp in result_thousand[ph].keys():
            temp = GetString(result_thousand[ph][opp])
            try:
                file.write(ph + " " + opp + " " + temp + "\n")
            except (TypeError, UnicodeEncodeError):
                # Best effort, as before: skip a record whose key is not a
                # string or cannot be encoded in the file's encoding.
                continue
- import shlex
# Turn the raw dump into plain whitespace-separated text: commas become
# spaces, and quotes, brackets, braces and parentheses are removed.
# Backslashes are deleted too, because the original filters ("\(", "\[", ...)
# contained a literal backslash; that behaviour is kept.
with open('thousands.txt', 'r') as f:
    # One translate() pass does the replacement and all deletions.
    f_str = f.read().translate(str.maketrans(",", " ", "'\\()[]{}"))

# A context manager closes the output file even if the write fails.
with open("thousands1.txt" , "w") as file:
    file.write(f_str)
f_str
def GetNewTuple(elem, threshold=0.01):
    """Collect the comparisons in *elem* whose p-value is below *threshold*.

    Args:
        elem: Three entries, reported in order as "first_with",
            "with_without" and "first_without". Each entry is either a bare
            p-value (or None) or a ``(label, p_value)`` pair such as the
            ones GetDist returns.
        threshold: Exclusive upper bound; only p-values strictly below it
            are kept.

    Returns:
        A list of ``[name, p_value]`` lists for the significant entries, in
        the order given above. Entries whose p-value is None are ignored.
    """
    names = ("first_with", "with_without", "first_without")
    res = []
    for name, entry in zip(names, elem):
        # Unwrap (label, p_value) pairs; comparing the pair itself with a
        # float would raise TypeError.
        p_value = entry[1] if isinstance(entry, (tuple, list)) else entry
        if p_value is not None and p_value < threshold:
            res.append([name, p_value])
    return res
# Same filtering as before, now using the GetNewTuple defined just above:
# keep, per key, only the axes with at least one accepted comparison.
with open("dist_dict.pkl", "rb") as file:
    data = pickle.load(file)

result_hundred = {}
for ph in data:
    significant = {}
    for opp in data[ph]:
        hits = GetNewTuple(data[ph][opp])
        if hits:
            significant[opp] = hits
    result_hundred[ph] = significant
result_hundred
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement