SHARE
TWEET

Untitled

a guest Apr 24th, 2019 49 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from bert_serving.client import BertClient
  2. import codecs
  3. from tqdm import tqdm
  4. import json
  5. import subprocess
  6. import numpy as np
  7.  
# Path to the vocabulary file: a JSON object whose keys are the words to embed
# (only the keys are read; the values are ignored by get_bert_embed).
vocabulary_path = '../../vocab.json'
# Filename prefix for the per-shard embedding output files; a numeric shard
# index is appended to this prefix (e.g. ../../bert_vec/bert_vect0, ...1, ...).
bert_vec_output_path = '../../bert_vec/bert_vect'
  10.  
  11. def get_bert_embed(vocabulary_path, bert_vec_output_path):
  12.  
  13.     def generate_bert_vec(words_list_, num_):
  14.         with open(bert_vec_output_path+  str(num_), "w") as output:
  15.             for w in tqdm(words_list_):
  16.                 vector = bc.encode([w])
  17.                 str_ = ''
  18.                 for i in vector.tolist()[0]:
  19.                     str_ += str(i) + ' '
  20.                 str_line = str_.strip(' ')
  21.                 output.write(w + " " + str_line + '\n')
  22.             output.close()
  23.  
  24.     bc = BertClient()
  25.     words_list = []
  26.     print("generate bert word vector...")
  27.     with open(vocabulary_path, "r") as vocab:
  28.         vocab_dict = json.load(vocab)
  29.  
  30.         for key in tqdm(vocab_dict.keys()):
  31.             if key == '' or key == '\u3000':  # \u3000     ''    ------>    <unk>
  32.                 key = '<unk>'
  33.                 words_list.append(key)
  34.             else:
  35.                 words_list.append(key)
  36.         vocab.close()
  37.         split_num = int(len(words_list) / 5000)
  38.         for num in range(0, split_num):
  39.             words_temp = words_list[num * 5000:(num + 1) * 5000]
  40.             generate_bert_vec(words_temp, num)
  41.             # p = multiprocessing.Process(target=generate_bert_vec, args=(words_temp, num))
  42.             print("start job...")
  43.             # p.start()
  44.         words_leftover = words_list[split_num * 5000:]
  45.         generate_bert_vec(words_leftover, split_num)
  46.  
  47. def cat_bert_vec():
  48.     home_name=bert_vec_output_path
  49.     file_name = ""
  50.     for i in range(81):
  51.         file_name += home_name + str(i) + " "
  52.     file_name = file_name.strip()
  53.     subprocess.call("cat " + file_name +" > ../../bert_vec/lic2019_bert_vector.768d.txt", shell=True)
  54.     print("finished! and are be saved at : ../../bert_vec/lic2019_bert_vector.768d.txt")
  55.  
  56.  
  57. def build_word_embedding_dict(path="../../bert_vec/lic2019_bert_vector.768d.txt"):
  58.  
  59.     f = open(path, "r", encoding="utf-8")
  60.     word_embedding_dict = {}
  61.     for line in f.readlines():
  62.         values = line.split()
  63.         wid_key = values[0]
  64.         wid_values = np.asarray(values[1:], dtype='float32')
  65.         word_embedding_dict[wid_key] = wid_values
  66.  
  67.     return word_embedding_dict
  68.  
  69.  
  70.  
### start: embed the whole vocabulary into shard files on import/run.
# NOTE(review): runs at module import time and requires a live bert-serving
# server; consider guarding with `if __name__ == "__main__":`.
get_bert_embed(vocabulary_path=vocabulary_path,bert_vec_output_path=bert_vec_output_path)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top