Advertisement
Guest User

Untitled

a guest
Sep 21st, 2019
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.06 KB | None | 0 0
  1. def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
  2. tokens = ['[CLS]']
  3. tokens.extend(tokenizer.tokenize(sentence))
  4. if len(tokens) > max_seq_len-1:
  5. tokens = tokens[:max_seq_len-1]
  6. tokens.append('[SEP]')
  7.  
  8. segment_ids = [0] * len(tokens)
  9. input_ids = tokenizer.convert_tokens_to_ids(tokens)
  10. input_mask = [1] * len(input_ids)
  11.  
  12. #Zero Mask till seq_length
  13. zero_mask = [0] * (max_seq_len-len(tokens))
  14. input_ids.extend(zero_mask)
  15. input_mask.extend(zero_mask)
  16. segment_ids.extend(zero_mask)
  17.  
  18. return input_ids, input_mask, segment_ids
  19.  
  20. def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
  21. all_input_ids = []
  22. all_input_mask = []
  23. all_segment_ids = []
  24.  
  25. for sentence in sentences:
  26. input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
  27. all_input_ids.append(input_ids)
  28. all_input_mask.append(input_mask)
  29. all_segment_ids.append(segment_ids)
  30.  
  31. return all_input_ids, all_input_mask, all_segment_ids
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement