SHARE
TWEET

Untitled

a guest Sep 21st, 2019 99 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
  2.     tokens = ['[CLS]']
  3.     tokens.extend(tokenizer.tokenize(sentence))
  4.     if len(tokens) > max_seq_len-1:
  5.         tokens = tokens[:max_seq_len-1]
  6.     tokens.append('[SEP]')
  7.    
  8.     segment_ids = [0] * len(tokens)
  9.     input_ids = tokenizer.convert_tokens_to_ids(tokens)
  10.     input_mask = [1] * len(input_ids)
  11.  
  12.     #Zero Mask till seq_length
  13.     zero_mask = [0] * (max_seq_len-len(tokens))
  14.     input_ids.extend(zero_mask)
  15.     input_mask.extend(zero_mask)
  16.     segment_ids.extend(zero_mask)
  17.    
  18.     return input_ids, input_mask, segment_ids
  19.  
  20. def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
  21.     all_input_ids = []
  22.     all_input_mask = []
  23.     all_segment_ids = []
  24.    
  25.     for sentence in sentences:
  26.         input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
  27.         all_input_ids.append(input_ids)
  28.         all_input_mask.append(input_mask)
  29.         all_segment_ids.append(segment_ids)
  30.    
  31.     return all_input_ids, all_input_mask, all_segment_ids
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top