PedroAlonso

Code GensimCrash

May 19th, 2020

# encoding: utf-8
# Train word2vec on a lemmatized corpus, then build a second model whose
# vocabulary vectors are overwritten with pretrained WordNet-graph
# embeddings before training.
import codecs
import logging
import re
import sys

import gensim
import numpy as np

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

epochsNum = 1  # training epochs for both models
def my_split(s):
    # Split one embedding line into (head, numbers): the text before the first
    # number, and the list of numeric tokens, which may use scientific
    # notation (e.g. 1.2E-05).
    number = r"-?\d+\.?\d*(?:[Ee][-+]?\d+)?"
    return re.split(number, s)[0], re.findall(number, s)

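# Quick sanity check of my_split on a made-up line (the word and the numbers
# below are illustrative, not from the real embeddings file):
_head, _nums = my_split("dog 0.12 -3.4E-02 1.0")
print(_head.rstrip(), _nums)  # -> dog ['0.12', '-3.4E-02', '1.0']
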
dimension = 300      # word2vec embedding size
threshold = 0.00055  # word2vec downsampling threshold (the `sample` parameter)

# Read the corpus: one whitespace-tokenized sentence per line.
sentences = []
with open("lemmatized.text", "r") as file:
    for line in file:
        sentences.append(line.split())

# First model: run all steps (vocab build + training) in one go.
model = gensim.models.Word2Vec(sentences, min_count=1, sample=threshold, sg=1,
                               size=dimension, negative=15, iter=epochsNum, window=3)

print(model.wv, 'in one go')

model.wv.save_word2vec_format('./GensimOneGo.txt', binary=False)
print('saved')
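
# Sanity check (a minimal sketch): the file just written is standard word2vec
# text format, so it can be reloaded with gensim's KeyedVectors.
reloaded = gensim.models.KeyedVectors.load_word2vec_format('./GensimOneGo.txt', binary=False)
print(len(reloaded.vocab), 'words reloaded')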

# Second model: create only the shell, build the vocab separately, inject the
# pretrained vectors, then train.
w2vObject = gensim.models.Word2Vec(min_count=1, sample=threshold, sg=1,
                                   size=dimension, negative=15, iter=epochsNum, window=3)

print('Starting vocab build')
w2vObject.build_vocab(sentences, progress_per=10000)  # build the vocab, as suggested on the gensim Google group

print(w2vObject.wv['the'], 'before train')

# Load the pretrained embeddings into a dict: word -> 300-dim float vector.
embeddings_path = f'../../../WordNetGraphHD/StorageEmbeddings/EmbeddingFormat{dimension}.txt'
f = codecs.open(embeddings_path, encoding='utf-8')
embeddings_index = {}
for num, line in enumerate(f):
    word, vector = my_split(line)
    word = word.rstrip()
    if len(vector) != dimension:
        # Malformed line: report it and skip it.
        print('line', num, 'has', len(vector), 'values, expected', dimension)
    else:
        embeddings_index[word] = np.asarray(vector, dtype='float32')
f.close()

print('wnet', list(embeddings_index.keys())[:10], 'keys')

# Sanity check: every stored vector should have exactly `dimension` entries.
for elem in embeddings_index:
    if embeddings_index[elem].shape[0] != dimension:
        print('found one', elem, embeddings_index[elem].shape[0])

print('now looking for common words')
# Overwrite the randomly initialized vectors with the pretrained ones for
# every word present in both vocabularies (gensim 3.x KeyedVectors supports
# item assignment, which replaces the stored vector).
i = 0
for elem in w2vObject.wv.vocab:
    if elem in embeddings_index:
        w2vObject.wv[elem] = embeddings_index[elem]
        i += 1

print(i, 'vocab words replaced with pretrained vectors')

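# Alternative worth noting (a sketch, not what this script does): if the
# pretrained file were already in word2vec text format, gensim 3.x could merge
# it straight into the freshly built vocab, with lockf=1.0 letting the
# injected vectors keep updating during training (0.0 would freeze them):
#
#   w2vObject.intersect_word2vec_format(embeddings_path, lockf=1.0, binary=False)
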
# Train on the corpus, starting from the injected vectors.
w2vObject.train(sentences, total_examples=w2vObject.corpus_count, epochs=epochsNum)

print(w2vObject.wv, 'after train')
w2vObject.wv.save_word2vec_format('./GensimOneWNet.txt', binary=False)
print('saved')
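
# Quick qualitative check before exiting (a sketch; 'the' is just assumed to
# be a frequent token in the corpus, any common word works):
print(w2vObject.wv.most_similar('the', topn=5))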
sys.exit()