# Untitled

import gensim

from gensim.models import KeyedVectors
# Load the pre-trained Google News word2vec vectors from a local binary file.
filename = 'C:\\Users\\Миша\\Downloads\\GoogleNews-vectors-negative300.bin.gz'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

# Sanity check with the classic analogy: (king - man) + woman = ?
analogy_query = {
    "positive": ['woman', 'king'],
    "negative": ['man'],
    "topn": 1,
}
result = model.most_similar(**analogy_query)
print(result)

import numpy as np

# Copy every embedding out of the gensim model into a plain {word: vector} dict.
# Gensim 4 removed KeyedVectors.vocab in favour of KeyedVectors.key_to_index;
# prefer the new attribute and fall back to .vocab so Gensim 3 keeps working.
vocabulary = getattr(model, "key_to_index", None)
if vocabulary is None:
    vocabulary = model.vocab

word2vecdict = {}
for word in vocabulary:
    # np.array() makes an independent copy of the model's vector.
    word2vecdict[word] = np.array(model[word])
print(word2vecdict["king"])

# Persist the {word: vector} dict so later runs can skip loading the model.
# pickle is imported here because the file-level `import pickle` only appears
# further down; without this the dump raises NameError when the script is run
# from top to bottom.
import pickle

with open("word2vecdict.pkl", "wb") as out_file:
    pickle.dump(word2vecdict, out_file, protocol=pickle.HIGHEST_PROTOCOL)

import pickle

# Pair each word's entry from word2phones.pkl with its word2vec vector,
# keeping only the words that have an embedding in word2vecdict.
word2phonesvec = {}
with open("C:\\Users\\Миша\\Desktop\\word2phones.pkl", "rb") as file:
    data = pickle.load(file)
for word, phones in data.items():
    if word not in word2vecdict:
        continue
    word2phonesvec[word] = (phones, word2vecdict[word])
print(word2phonesvec["is"])

# Build one vector per adjective: the sum of the word2vec vectors of the words
# listed for it in adj_dic.
# NOTE(review): adj_dic is not defined in the visible part of this file; it is
# presumably a mapping {adjective: iterable of words} — confirm where it is set.
adj_vec = {}
temp_word = []
temp_vec = []
for adj in adj_dic.keys():
    temp_word = adj_dic[adj]
    # Start from an empty list for every adjective. The previous version kept
    # appending to a single list, so each adjective's sum also contained the
    # vectors of all adjectives processed before it.
    # Words without an embedding are skipped.
    temp_vec = [word2vecdict[word] for word in temp_word if word in word2vecdict]
    # With no known words np.sum([]) yields the scalar 0.0 rather than a vector.
    adj_vec[adj] = np.sum(temp_vec, axis=0)
adj_vec

# Antonym axes: each entry is (result key, minuend adjective, subtrahend adjective).
# The axis vector is adj_vec[minuend] - adj_vec[subtrahend].
antonym_axes = (
    ("good_bad", "Good", "Bad"),
    ("angular_round", "Angular", "Round"),
    ("cold_hot", "Cold", "Hot"),
    ("complex_simple", "Complex", "Simple"),
    ("cowardly_brave", "Cowardly", "Brave"),
    ("dangerous_secure", "Dangerous", "Secure"),
    ("luminous_dark", "Luminous", "Dark"),
    ("difficult_easy", "Difficult", "Easy"),
    ("benign_evil", "Benign", "Evil"),
    ("bright_faded", "Bright", "Faded"),
    ("strong_feeble", "Strong", "Feeble"),
    ("masculine_feminine", "Masculine", "Feminine"),
    ("passive_active", "Passive", "Active"),
    ("quiet_loud", "Quiet", "Loud"),
    ("rough_tender", "Rough", "Tender"),
    ("joyful_sad", "Joyful", "Sad"),
    ("short_long", "Short", "Long"),
    ("quick_slow", "Quick", "Slow"),
    ("small_big", "Small", "Big"),
    ("rough_or_not_smooth_smooth", "Rough_or_not_smooth", "Smooth"),
    ("beautiful_ugly", "Beautiful", "Ugly"),
)

opposite_dict = {}
for axis_name, minuend, subtrahend in antonym_axes:
    opposite_dict[axis_name] = adj_vec[minuend] - adj_vec[subtrahend]
opposite_dict

# Load the pickled dict from sounds_in_words_dict.pkl into `data`
# (this rebinds the module-level `data` used by the cells below).
# NOTE(review): the projection loop below iterates data[ph] as a sequence of
# word lists, so values are presumably lists of word lists — confirm.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles you produced yourself.
with open("sounds_in_words_dict.pkl" , "rb") as file:
    data = pickle.load(file)
data

def GetCos(v1, v2):
    """Return the dot product of *v1* and *v2*.

    No normalisation is done here, so the result is the cosine of the angle
    between the vectors only when both inputs already have unit length.
    """
    return np.dot(v1, v2)

# For every phoneme and every antonym axis, project the unit-length word
# vectors of each word list in data[ph] onto the unit-length axis.
# Result: projection_dict[ph][opp] is a list (one entry per word list in
# data[ph]) of lists of projections.

# Normalising an axis does not depend on the phoneme, so do it once up front.
unit_axes = {}
for opp in opposite_dict.keys():
    unit_axes[opp] = opposite_dict[opp] / np.linalg.norm(opposite_dict[opp])

projection_dict = {}
for ph in data.keys():
    projection_dict[ph] = {}
    for opp in unit_axes.keys():
        axes = unit_axes[opp]
        axes_distributions = []
        for arr in data[ph]:
            distribution_list = []
            for word in arr:
                # Words without an embedding are skipped. Only that case is
                # tolerated; any other error now surfaces instead of being
                # hidden by a bare `except`.
                if word not in word2vecdict:
                    continue
                word_vector = np.asarray(word2vecdict[word])
                word_vector = word_vector / np.linalg.norm(word_vector)
                distribution_list.append(GetCos(word_vector, axes))
            axes_distributions.append(distribution_list)
        projection_dict[ph][opp] = axes_distributions
projection_dict

import scipy.stats as stats
# Replace the in-memory projection_dict with the one stored in
# projection_dict.pkl.
# NOTE(review): nothing in the visible file writes projection_dict.pkl, so it
# is presumably a dump from an earlier run — confirm it matches the dict
# computed above.
with open("projection_dict.pkl" , "rb") as file:
    projection_dict = pickle.load(file)


# Filled further down with dist_dict[ph][opp] = GetDist(projection_dict[ph][opp]).
dist_dict ={}

def GetDist(elem):
    """Run pairwise Mann-Whitney U tests between the three samples in *elem*.

    A pair is tested only when both samples have more than 20 values;
    otherwise its p-value is reported as None.

    Returns a tuple of three (label, p_value) pairs, in this order:
        ("f_w",  p for elem[0] vs elem[1])
        ("w_wo", p for elem[0] vs elem[2])
        ("f_wo", p for elem[1] vs elem[2])

    NOTE(review): which sample is "f", "w" and "wo" is not established in this
    file; verify that the "w_wo" / "f_wo" labels match the index pairs above.
    """
    min_size = 20

    def p_value(i, j):
        # Skip the test when either sample is too small.
        if len(elem[i]) <= min_size or len(elem[j]) <= min_size:
            return None
        return stats.mannwhitneyu(elem[i], elem[j])[1]

    f_w = p_value(0, 1)
    f_wo = p_value(1, 2)
    w_wo = p_value(0, 2)
    return (("f_w", f_w), ("w_wo", w_wo), ("f_wo", f_wo))
# Run the pairwise tests for every (phoneme, axis) entry of projection_dict.
for ph, per_axis in projection_dict.items():
    axis_tests = dist_dict[ph] = {}
    for opp in per_axis:
        axis_tests[opp] = GetDist(per_axis[opp])
dist_dict


def GetNewTuple(elem, threshold=0.001):
    """Return the p-values in *elem* that fall below *threshold*.

    Args:
        elem: indexable with at least three entries. Each entry is either a
            p-value (or None when no test was run) or a labelled pair
            ``(name, p_value)`` as returned by GetDist; a pair is unwrapped to
            its p-value.
        threshold: exclusive upper bound for a p-value to be kept.

    Returns:
        A list of ``[label, p_value]`` items, where the label is chosen by
        position: 0 -> "first_with", 1 -> "with_without", 2 -> "first_without".
    """
    labels = ("first_with", "with_without", "first_without")
    res = []
    for position, label in enumerate(labels):
        entry = elem[position]
        # GetDist stores each result as (name, p_value); comparing that pair
        # with a float raises TypeError, so take the p-value out first.
        if isinstance(entry, (tuple, list)) and len(entry) == 2:
            entry = entry[1]
        if entry is not None and entry < threshold:
            res.append([label, entry])
    return res

# Load the stored test results and keep, per phoneme, only the axes that have
# at least one p-value accepted by GetNewTuple.
with open("dist_dict.pkl", "rb") as file:
    data = pickle.load(file)

result_thousand = {}
for ph, per_axis in data.items():
    significant = {}
    for opp in per_axis:
        hits = GetNewTuple(per_axis[opp])
        if hits:
            significant[opp] = hits
    result_thousand[ph] = significant
result_thousand
def GetString(elem):
    """Return the str() of every item in *elem*, joined by single spaces."""
    return " ".join(map(str, elem))

# Write one "<phoneme> <axis> <significant pairs>" line per entry.
# The file is opened once, before the loops: reopening it in "w" mode for every
# entry truncated it each time, so only the last record survived. Records are
# terminated with a real newline ("\n"), not the literal text "/n".
with open("thousands.txt", "w") as out_file:
    for ph in result_thousand.keys():
        for opp in result_thousand[ph].keys():
            temp = GetString(result_thousand[ph][opp])
            line = "{} {} {}\n".format(ph, opp, temp)
            try:
                out_file.write(line)
            except UnicodeEncodeError:
                # Best effort, as before: skip a record the platform's default
                # text encoding cannot represent instead of aborting the export.
                continue

import shlex
# Strip list/tuple punctuation from thousands.txt and save the result as
# thousands1.txt. Commas become spaces; quotes, backslashes and all brackets
# are removed. The backslash is listed explicitly because the previous
# patterns ("\(", "\[", ...) were invalid escape sequences that contained one.
cleanup_table = str.maketrans({
    ",": " ",
    "'": None,
    "\\": None,
    "(": None,
    ")": None,
    "[": None,
    "]": None,
    "{": None,
    "}": None,
})

with open('thousands.txt', 'r') as f:
    # A single translate() pass replaces the replace() call plus seven filters.
    f_str = f.read().translate(cleanup_table)

# `with` guarantees the output file is closed even if the write fails.
with open("thousands1.txt", "w") as file:
    file.write(f_str)
f_str

def GetNewTuple(elem, threshold=0.01):
    """Return the p-values in *elem* that fall below *threshold*.

    This definition replaces the earlier GetNewTuple from this point on; its
    default threshold is 0.01.

    Args:
        elem: indexable with at least three entries. Each entry is either a
            p-value (or None when no test was run) or a labelled pair
            ``(name, p_value)`` as returned by GetDist; a pair is unwrapped to
            its p-value.
        threshold: exclusive upper bound for a p-value to be kept.

    Returns:
        A list of ``[label, p_value]`` items, where the label is chosen by
        position: 0 -> "first_with", 1 -> "with_without", 2 -> "first_without".
    """
    labels = ("first_with", "with_without", "first_without")
    res = []
    for position, label in enumerate(labels):
        entry = elem[position]
        # GetDist stores each result as (name, p_value); comparing that pair
        # with a float raises TypeError, so take the p-value out first.
        if isinstance(entry, (tuple, list)) and len(entry) == 2:
            entry = entry[1]
        if entry is not None and entry < threshold:
            res.append([label, entry])
    return res

# Same filtering as above, using the GetNewTuple defined just before this
# block: keep, per phoneme, only the axes with at least one accepted p-value.
with open("dist_dict.pkl", "rb") as file:
    data = pickle.load(file)

result_hundred = {}
for ph, per_axis in data.items():
    significant = {}
    for opp in per_axis:
        hits = GetNewTuple(per_axis[opp])
        if hits:
            significant[opp] = hits
    result_hundred[ph] = significant
result_hundred