import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from bert.tokenization import FullTokenizer
- """
- copy https://github.com/deepset-ai/bert-tensorflow/blob/master/samples/vocab.txt
- """
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    """WordPiece-tokenize one sentence and pad the features to max_seq_len."""
    tokens = ['[CLS]']
    tokens.extend(tokenizer.tokenize(sentence))
    # Truncate so that [CLS] + wordpieces + [SEP] fits within max_seq_len.
    if len(tokens) > max_seq_len - 1:
        tokens = tokens[:max_seq_len - 1]
    tokens.append('[SEP]')
    segment_ids = [0] * len(tokens)  # single-sentence input: all segment 0
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)  # 1 for real tokens, 0 for padding
    # Zero-pad ids, mask, and segment ids up to max_seq_len.
    zero_mask = [0] * (max_seq_len - len(tokens))
    input_ids.extend(zero_mask)
    input_mask.extend(zero_mask)
    segment_ids.extend(zero_mask)
    return input_ids, input_mask, segment_ids
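# Worked example: if tokenizer.tokenize(sentence) yields the three wordpieces
# ['I', 'like', 'cats'] and max_seq_len=8, then
#     tokens     = ['[CLS]', 'I', 'like', 'cats', '[SEP]']
#     input_mask = [1, 1, 1, 1, 1, 0, 0, 0]
# i.e. five real positions followed by three zero-padded positions.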
def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
    """Convert a batch of sentences into padded feature lists."""
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    for sentence in sentences:
        input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        all_input_ids.append(input_ids)
        all_input_mask.append(input_mask)
        all_segment_ids.append(segment_ids)
    return all_input_ids, all_input_mask, all_segment_ids
class Stub_Feature_Extractor:
    def __init__(self):
        self.BERT_PATH = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
        self.bert = hub.Module(self.BERT_PATH)
        self.tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=False)
        self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])
        self.input_mask = tf.placeholder(dtype=tf.int32, shape=[None, None])
        self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])
        self.bert_inputs = dict(
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            segment_ids=self.segment_ids)
        self.bert_outputs = self.bert(self.bert_inputs, signature="tokens", as_dict=True)
        # Initialize only after the full graph has been built.
        self.sess = tf.Session()
        self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    def get_features(self, sentences):
        """Return token-level BERT features, shape (batch, max_seq_len, 768)."""
        input_ids_vals, input_mask_vals, segment_ids_vals = convert_sentences_to_features(
            sentences, self.tokenizer, 20)
        # out has two keys: 'sequence_output' (one vector per token) and
        # 'pooled_output' (one vector per sentence).
        out = self.sess.run(self.bert_outputs,
                            feed_dict={self.input_ids: input_ids_vals,
                                       self.input_mask: input_mask_vals,
                                       self.segment_ids: segment_ids_vals})
        return out["sequence_output"]
if __name__ == "__main__":
    feature_extractor = Stub_Feature_Extractor()
    sentences = ['I prefer Python over Java', 'I like coding in Python', 'coding is fun']
    print(feature_extractor.get_features(sentences))
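    # A minimal follow-up sketch (assumption: mean pooling over the token
    # axis is one reasonable way to get a fixed-size vector per sentence;
    # the module's 'pooled_output' key is an alternative):
    features = np.asarray(feature_extractor.get_features(sentences))
    print(features.shape)  # (3, 20, 768) for this BERT-Base module
    sentence_vecs = features.mean(axis=1)  # (3, 768): one vector per sentence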