Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- from bert_serving.client import BertClient
- import codecs
- from tqdm import tqdm
- import json
- import subprocess
- import numpy as np
# Path to the vocabulary JSON file; its keys are the words to embed.
vocabulary_path = '../../vocab.json'
# Output path prefix for the per-chunk BERT vector files; a chunk index is appended.
bert_vec_output_path = '../../bert_vec/bert_vect'
def get_bert_embed(vocabulary_path, bert_vec_output_path, chunk_size=5000):
    """Encode every vocabulary word with bert-serving and write the vectors
    to chunked text files.

    Parameters
    ----------
    vocabulary_path : str
        Path to a JSON file whose keys are the vocabulary words.
    bert_vec_output_path : str
        Output path prefix; chunk ``i`` is written to
        ``bert_vec_output_path + str(i)``.
    chunk_size : int
        Number of words per output chunk (default 5000, the original split).

    Each output line has the form ``<word> <v1> <v2> ...``.
    Requires a running bert-serving server reachable by ``BertClient``.
    """
    def generate_bert_vec(words_chunk, chunk_index):
        # Write one chunk file: each line is the word followed by its
        # space-separated vector components.
        with open(bert_vec_output_path + str(chunk_index), "w") as output:
            for word in tqdm(words_chunk):
                vector = bc.encode([word])
                # join at C speed instead of quadratic `str +=` in a loop
                line = ' '.join(str(v) for v in vector.tolist()[0])
                output.write(word + " " + line + '\n')

    bc = BertClient()
    print("generate bert word vector...")
    with open(vocabulary_path, "r") as vocab:
        vocab_dict = json.load(vocab)

    words_list = []
    for key in tqdm(vocab_dict.keys()):
        # Map the empty string and the ideographic space (U+3000) to <unk>.
        if key == '' or key == '\u3000':
            key = '<unk>'
        words_list.append(key)

    split_num = int(len(words_list) / chunk_size)
    for num in range(split_num):
        generate_bert_vec(words_list[num * chunk_size:(num + 1) * chunk_size], num)
        print("start job...")
    # Leftover words (fewer than chunk_size) form the final chunk.
    generate_bert_vec(words_list[split_num * chunk_size:], split_num)
def cat_bert_vec(num_files=81):
    """Concatenate the per-chunk vector files into one embedding file.

    Parameters
    ----------
    num_files : int
        Number of chunk files (``bert_vec_output_path + str(i)`` for
        ``i`` in ``range(num_files)``) to concatenate; default 81 matches
        the original hard-coded vocabulary split.
    """
    # Build the space-separated file list in one pass instead of `+=`.
    file_name = " ".join(bert_vec_output_path + str(i) for i in range(num_files))
    # NOTE(review): shell=True with a concatenated command string — acceptable
    # only because the paths are trusted constants; never feed untrusted
    # input through here.
    subprocess.call("cat " + file_name + " > ../../bert_vec/lic2019_bert_vector.768d.txt", shell=True)
    print("finished! and are be saved at : ../../bert_vec/lic2019_bert_vector.768d.txt")
def build_word_embedding_dict(path="../../bert_vec/lic2019_bert_vector.768d.txt"):
    """Load a whitespace-separated embedding file into a dict.

    Parameters
    ----------
    path : str
        Embedding file where each line is ``<word> <v1> <v2> ...``.

    Returns
    -------
    dict[str, numpy.ndarray]
        Maps each word to its float32 vector.
    """
    word_embedding_dict = {}
    # `with` closes the handle (the original leaked it), and iterating the
    # handle streams lines instead of materializing readlines().
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines instead of raising IndexError.
                continue
            word_embedding_dict[values[0]] = np.asarray(values[1:], dtype='float32')
    return word_embedding_dict
###start
# Script entry point: generate the chunked BERT vectors for the vocabulary.
# NOTE(review): this runs at import time — consider wrapping in an
# `if __name__ == "__main__":` guard; confirm no caller relies on import-time execution.
get_bert_embed(vocabulary_path=vocabulary_path,bert_vec_output_path=bert_vec_output_path)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement