Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    """Turn one sentence into fixed-length BERT-style input lists.

    The sentence is tokenized, wrapped as ``[CLS] ... [SEP]``, truncated so
    the total length never exceeds ``max_seq_len``, converted to ids, and
    right-padded with zeros up to ``max_seq_len``.

    Args:
        sentence: Text passed to ``tokenizer.tokenize``.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_seq_len: Length of each returned list; must be at least 1.
            With a value of 1 only ``[SEP]`` fits and ``[CLS]`` is dropped.

    Returns:
        A tuple ``(input_ids, input_mask, segment_ids)`` of three lists,
        each of length ``max_seq_len``. ``input_mask`` is 1 for real tokens
        and 0 for padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_seq_len`` is smaller than 1.
    """
    # A non-positive length would make the truncation slice below count
    # from the wrong end and yield output longer than max_seq_len.
    if max_seq_len < 1:
        raise ValueError(
            "max_seq_len must be at least 1, got {}".format(max_seq_len)
        )

    tokens = ['[CLS]']
    tokens.extend(tokenizer.tokenize(sentence))
    # Reserve the final slot for the closing separator.
    tokens = tokens[:max_seq_len - 1]
    tokens.append('[SEP]')

    input_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(tokens)

    # Right-pad all three lists with zeros up to max_seq_len.
    # NOTE(review): assumes id 0 is the tokenizer's padding id -- confirm.
    padding = [0] * (max_seq_len - len(tokens))
    input_ids.extend(padding)
    input_mask.extend(padding)
    segment_ids.extend(padding)
    return input_ids, input_mask, segment_ids
def convert_sentences_to_features(sentences, tokenizer, max_seq_len=20):
    """Convert several sentences into parallel lists of model inputs.

    Each sentence is passed to ``convert_sentence_to_features`` with the
    same ``tokenizer`` and ``max_seq_len``; the per-sentence results are
    collected into three lists that share the order of ``sentences``.

    Args:
        sentences: Iterable of sentences to convert.
        tokenizer: Tokenizer forwarded unchanged to
            ``convert_sentence_to_features``.
        max_seq_len: Sequence length forwarded to
            ``convert_sentence_to_features`` (default 20).

    Returns:
        A tuple ``(all_input_ids, all_input_mask, all_segment_ids)`` with
        one entry per sentence in each list. All three lists are empty
        when ``sentences`` is empty.
    """
    # Materialize once so a one-shot iterable is only consumed a single time.
    features = [
        convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        for sentence in sentences
    ]
    all_input_ids = [input_ids for input_ids, _, _ in features]
    all_input_mask = [input_mask for _, input_mask, _ in features]
    all_segment_ids = [segment_ids for _, _, segment_ids in features]
    return all_input_ids, all_input_mask, all_segment_ids
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement