PedroAlonso

Code GensimCrash

May 19th, 2020

# encoding: utf-8
# Train word2vec on a lemmatized corpus, then build a second model whose
# vocabulary vectors are overwritten with pretrained WordNet-graph
# embeddings before training.
import codecs
import logging
import re
import sys

import gensim
import numpy as np

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

epochsNum = 1  # training epochs for both models
def my_split(s):
    # Split one embedding line into (head, numbers): the text before the first
    # number, and the list of numeric tokens, which may use scientific
    # notation (e.g. 1.2E-05).
    number = r"-?\d+\.?\d*(?:[Ee][-+]?\d+)?"
    return re.split(number, s)[0], re.findall(number, s)

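# Quick sanity check of my_split on a made-up line (the word and the numbers
# below are illustrative, not from the real embeddings file):
_head, _nums = my_split("dog 0.12 -3.4E-02 1.0")
print(_head.rstrip(), _nums)  # -> dog ['0.12', '-3.4E-02', '1.0']
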
dimension = 300      # word2vec embedding size
threshold = 0.00055  # word2vec downsampling threshold (the `sample` parameter)

# Read the corpus: one whitespace-tokenized sentence per line.
sentences = []
with open("lemmatized.text", "r") as file:
    for line in file:
        sentences.append(line.split())

# First model: run all steps (vocab build + training) in one go.
model = gensim.models.Word2Vec(sentences, min_count=1, sample=threshold, sg=1,
                               size=dimension, negative=15, iter=epochsNum, window=3)

print(model.wv, 'in one go')

model.wv.save_word2vec_format('./GensimOneGo.txt', binary=False)
print('saved')
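
# Sanity check (a minimal sketch): the file just written is standard word2vec
# text format, so it can be reloaded with gensim's KeyedVectors.
reloaded = gensim.models.KeyedVectors.load_word2vec_format('./GensimOneGo.txt', binary=False)
print(len(reloaded.vocab), 'words reloaded')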

# Second model: create only the shell, build the vocab separately, inject the
# pretrained vectors, then train.
w2vObject = gensim.models.Word2Vec(min_count=1, sample=threshold, sg=1,
                                   size=dimension, negative=15, iter=epochsNum, window=3)

print('Starting vocab build')
w2vObject.build_vocab(sentences, progress_per=10000)  # build the vocab, as suggested on the gensim Google group

print(w2vObject.wv['the'], 'before train')

# Load the pretrained embeddings into a dict: word -> 300-dim float vector.
embeddings_path = f'../../../WordNetGraphHD/StorageEmbeddings/EmbeddingFormat{dimension}.txt'
f = codecs.open(embeddings_path, encoding='utf-8')
embeddings_index = {}
for num, line in enumerate(f):
    word, vector = my_split(line)
    word = word.rstrip()
    if len(vector) != dimension:
        # Malformed line: report it and skip it.
        print('line', num, 'has', len(vector), 'values, expected', dimension)
    else:
        embeddings_index[word] = np.asarray(vector, dtype='float32')
f.close()

print('wnet', list(embeddings_index.keys())[:10], 'keys')

# Sanity check: every stored vector should have exactly `dimension` entries.
for elem in embeddings_index:
    if embeddings_index[elem].shape[0] != dimension:
        print('found one', elem, embeddings_index[elem].shape[0])

print('now looking for common words')
# Overwrite the randomly initialized vectors with the pretrained ones for
# every word present in both vocabularies (gensim 3.x KeyedVectors supports
# item assignment, which replaces the stored vector).
i = 0
for elem in w2vObject.wv.vocab:
    if elem in embeddings_index:
        w2vObject.wv[elem] = embeddings_index[elem]
        i += 1

print(i, 'vocab words replaced with pretrained vectors')

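# Alternative worth noting (a sketch, not what this script does): if the
# pretrained file were already in word2vec text format, gensim 3.x could merge
# it straight into the freshly built vocab, with lockf=1.0 letting the
# injected vectors keep updating during training (0.0 would freeze them):
#
#   w2vObject.intersect_word2vec_format(embeddings_path, lockf=1.0, binary=False)
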
# Train on the corpus, starting from the injected vectors.
w2vObject.train(sentences, total_examples=w2vObject.corpus_count, epochs=epochsNum)

print(w2vObject.wv, 'after train')
w2vObject.wv.save_word2vec_format('./GensimOneWNet.txt', binary=False)
print('saved')
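
# Quick qualitative check before exiting (a sketch; 'the' is just assumed to
# be a frequent token in the corpus, any common word works):
print(w2vObject.wv.most_similar('the', topn=5))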
sys.exit()