Untitled

import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm
from tensorflow.keras import backend as K


"""
copy https://github.com/deepset-ai/bert-tensorflow/blob/master/samples/vocab.txt
"""


def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    tokens = ['[CLS]']
    tokens.extend(tokenizer.tokenize(sentence))
    if len(tokens) > max_seq_len - 1:
        tokens = tokens[:max_seq_len - 1]
    tokens.append('[SEP]')

    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    # Zero Mask till seq_length
    zero_mask = [0] * (max_seq_len - len(tokens))
    input_ids.extend(zero_mask)
    input_mask.extend(zero_mask)
    segment_ids.extend(zero_mask)

    return input_ids, input_mask, segment_ids


def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []

    for sentence in sentences:
        input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        all_input_ids.append(input_ids)
        all_input_mask.append(input_mask)
        all_segment_ids.append(segment_ids)

    return all_input_ids, all_input_mask, all_segment_ids


class Stub_Feature_Extractor:

    def __init__(self):
        self.BERT_PATH = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
        self.bert = hub.Module(self.BERT_PATH)
        self.tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=False)
        self.sess = tf.Session()

        self.sess.run(tf.global_variables_initializer())

        self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])
        self.input_mask = tf.placeholder(dtype=tf.int32, shape=[None, None])
        self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])

        self.bert_inputs = dict(
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            segment_ids=self.segment_ids)

        self.bert_outputs = self.bert(self.bert_inputs, signature="tokens", as_dict=True)

    def get_features(self, sentences):
        input_ids_vals, input_mask_vals, segment_ids_vals = convert_sentences_to_features(sentences, self.tokenizer, 20)

        out = self.sess.run(self.bert_outputs,
                       feed_dict={self.input_ids: input_ids_vals, self.input_mask: input_mask_vals,
                                  self.segment_ids: segment_ids_vals})

        # out has two keys `dict_keys(['sequence_output', 'pooled_output'])`
        input_ids_vals, input_mask_vals, segment_ids_vals = convert_sentences_to_features(sentences, self.tokenizer, 20)

        out = self.sess.run(self.bert_outputs,
                       feed_dict={self.input_ids: input_ids_vals, self.input_mask: input_mask_vals,
                                  self.segment_ids: segment_ids_vals})

        return out["sequence_output"]


if __name__ == "__main__":
    feature_extractor = Stub_Feature_Extractor()
    sentences = ['I prefer Python over Java', 'I like coding in Python', 'coding is fun']
    print(feature_extractor.get_features(sentences))