# encoding: utf-8
# imports and logging setup
import gensim
import logging
import codecs
import sys
import re
import numpy as np

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

epochsNum = 1  # number of training epochs for both Word2Vec runs
def my_split(s):
    # Split an embedding line into (word prefix, list of number strings).
    # The pattern matches signed decimals with optional scientific notation,
    # e.g. -1.5E-04.
    number = r"-?\d+\.?\d*(?:[Ee][-+]?\d+)?"
    return re.split(number, s)[0], re.findall(number, s)
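# Hedged usage sketch (the sample line is illustrative, not taken from the
# real embedding file):
# >>> my_split("be 0.123 -4.5E-02 7.0")
# ('be ', ['0.123', '-4.5E-02', '7.0'])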
dimension = 300      # Word2vec embedding size (vector dimensionality)
threshold = 0.00055  # Word2vec downsampling threshold (sample parameter)

# Read the corpus: one sentence per line, whitespace-separated tokens.
sentences = []
with open("lemmatized.text", "r") as file:
    for line in file:
        sentences.append(line.split())
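# Illustrative shape of the corpus after reading (hypothetical tokens, not
# the real file content):
# sentences == [['the', 'cat', 'sit', 'on', 'the', 'mat'], ...]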
# First run: train Word2Vec on the whole corpus in one go.
# Note: size/iter are gensim 3.x parameter names (vector_size/epochs in 4.x),
# matching the wv.vocab access used further down.
model = gensim.models.Word2Vec(sentences, min_count=1, sample=threshold, sg=1,
                               size=dimension, negative=15, iter=epochsNum, window=3)
print(model.wv, 'in one go')
model.wv.save_word2vec_format('./GensimOneGo.txt', binary=False)
print('saved')
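# Hedged sketch: the saved text file can be reloaded as read-only vectors via
# standard gensim API; the query token 'be' is only assumed to be in the
# vocabulary for illustration.
# reloaded = gensim.models.KeyedVectors.load_word2vec_format('./GensimOneGo.txt', binary=False)
# print(reloaded.most_similar('be', topn=5))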
# Second run: create only the model shell, then build the vocabulary
# separately (per the gensim google-groups recipe) so that pretrained
# vectors can be injected before training.
w2vObject = gensim.models.Word2Vec(min_count=1, sample=threshold, sg=1,
                                   size=dimension, negative=15, iter=epochsNum, window=3)
print('Starting vocab build')
w2vObject.build_vocab(sentences, progress_per=10000)
print(w2vObject.wv['the'], 'before train')  # still the random initialisation
# Load the pretrained WordNet-graph embeddings into a dict: word -> vector.
embeddings_index = {}
f = codecs.open(f'../../../WordNetGraphHD/StorageEmbeddings/EmbeddingFormat{dimension}.txt',
                encoding='utf-8')
for num, line in enumerate(f):
    word, vector = my_split(line)
    word = word.rstrip()
    if len(vector) != dimension:
        print(line, 'here not', dimension)  # malformed line: skip it
    else:
        # Parse the number strings into a float vector; without the dtype the
        # array would hold strings and be useless as an embedding.
        coefs = np.asarray(vector, dtype='float32')
        embeddings_index[word] = coefs
f.close()
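# Hedged alternative, assuming the file were in standard word2vec text format
# (header line, then "word v1 v2 ... vN" per line); then the custom parsing
# above could be replaced by gensim's loader:
# wnet = gensim.models.KeyedVectors.load_word2vec_format(
#     f'../../../WordNetGraphHD/StorageEmbeddings/EmbeddingFormat{dimension}.txt', binary=False)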
print('wnet', list(embeddings_index.keys())[:10], 'keys')  # sanity check: first few loaded words
# Sanity check: every loaded vector must have exactly `dimension` components.
for elem in embeddings_index:
    if embeddings_index[elem].shape[0] != dimension:
        print('wrong shape:', elem, embeddings_index[elem].shape[0])
print('now looking for common words')
# Inject the pretrained vector for every word that occurs both in the corpus
# vocabulary and in the embeddings file, overwriting the random initialisation.
i = 0
for elem in w2vObject.wv.vocab:
    if elem in embeddings_index:
        # gensim 3.x KeyedVectors supports item assignment for existing words
        w2vObject.wv[elem] = embeddings_index[elem]
        i += 1
print(i, 'words replaced with pretrained vectors')
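# Hedged built-in alternative: gensim 3.x ships Word2Vec.intersect_word2vec_format,
# which merges pretrained vectors for in-vocab words in one call (lockf=0.0
# freezes them, lockf=1.0 lets them keep training). Sketch only, assuming the
# file is in word2vec text format:
# w2vObject.intersect_word2vec_format(
#     f'../../../WordNetGraphHD/StorageEmbeddings/EmbeddingFormat{dimension}.txt',
#     lockf=1.0, binary=False)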
# Continue training from the injected vectors.
w2vObject.train(sentences, total_examples=w2vObject.corpus_count, epochs=epochsNum)
print(w2vObject.wv, 'after train')
w2vObject.wv.save_word2vec_format('./GensimOneWNet.txt', binary=False)
print('saved')
sys.exit()