Untitled
a guest · Feb 26th, 2020
import tensorflow as tf
import tensorflow_hub as hub
from bert.tokenization import FullTokenizer


# The tokenizer needs a BERT vocabulary file on disk. Copy vocab.txt from
# https://github.com/deepset-ai/bert-tensorflow/blob/master/samples/vocab.txt
# into the working directory before running this script.
  15.  
  16.  
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    # WordPiece-tokenize and add the standard BERT special tokens,
    # truncating so that [CLS] ... [SEP] fits within max_seq_len.
    tokens = ['[CLS]']
    tokens.extend(tokenizer.tokenize(sentence))
    if len(tokens) > max_seq_len - 1:
        tokens = tokens[:max_seq_len - 1]
    tokens.append('[SEP]')

    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    # Zero-pad ids, mask and segment ids out to max_seq_len.
    zero_mask = [0] * (max_seq_len - len(tokens))
    input_ids.extend(zero_mask)
    input_mask.extend(zero_mask)
    segment_ids.extend(zero_mask)

    return input_ids, input_mask, segment_ids
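# For example, with max_seq_len=8 a sentence that tokenizes to three
# WordPiece tokens becomes ['[CLS]', t1, t2, t3, '[SEP]'] plus three zero
# pads, i.e. input_mask [1, 1, 1, 1, 1, 0, 0, 0] and all-zero segment_ids.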
  35.  
  36.  
def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []

    for sentence in sentences:
        input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        all_input_ids.append(input_ids)
        all_input_mask.append(input_mask)
        all_segment_ids.append(segment_ids)

    return all_input_ids, all_input_mask, all_segment_ids
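# Every sentence is padded/truncated to the same max_seq_len, so the three
# lists form rectangular batches that can be fed directly to the
# placeholders defined below.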
  49.  
  50.  
class Stub_Feature_Extractor:

    def __init__(self):
        self.BERT_PATH = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
        self.bert = hub.Module(self.BERT_PATH)
        self.tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=False)

        self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])
        self.input_mask = tf.placeholder(dtype=tf.int32, shape=[None, None])
        self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])

        self.bert_inputs = dict(
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            segment_ids=self.segment_ids)

        self.bert_outputs = self.bert(self.bert_inputs, signature="tokens", as_dict=True)

        # Initialize variables (and the hub module's lookup tables) only
        # after the whole graph has been built.
        self.sess = tf.Session()
        self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()])

    def get_features(self, sentences):
        input_ids_vals, input_mask_vals, segment_ids_vals = convert_sentences_to_features(
            sentences, self.tokenizer, 20)

        # out has two keys: dict_keys(['sequence_output', 'pooled_output'])
        out = self.sess.run(self.bert_outputs,
                            feed_dict={self.input_ids: input_ids_vals,
                                       self.input_mask: input_mask_vals,
                                       self.segment_ids: segment_ids_vals})

        return out["sequence_output"]
  87.  
  88.  
if __name__ == "__main__":
    feature_extractor = Stub_Feature_Extractor()
    sentences = ['I prefer Python over Java', 'I like coding in Python', 'coding is fun']
    print(feature_extractor.get_features(sentences))
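# get_features returns the token-level `sequence_output`, which for this
# BERT-base module (H=768) has shape (batch_size, max_seq_len, 768). A
# minimal sketch for collapsing it into one fixed-size vector per sentence,
# assuming you keep the input_mask lists from convert_sentences_to_features
# alongside the output (the paste's get_features does not return them):

import numpy as np

def mean_pool(sequence_output, input_mask):
    # Average token vectors, ignoring the zero-padded positions.
    mask = np.asarray(input_mask)[:, :, None].astype(np.float32)  # (batch, seq, 1)
    summed = (sequence_output * mask).sum(axis=1)                 # (batch, 768)
    counts = np.maximum(mask.sum(axis=1), 1.0)                    # avoid divide-by-zero
    return summed / counts

# Example: embeddings = mean_pool(out["sequence_output"], input_mask_vals)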