import re
from typing import List

from fastai.text import BaseTokenizer
from transformers import RobertaTokenizer


class FastAiRobertaTokenizer(BaseTokenizer):
    """Wrapper around RobertaTokenizer to make it compatible with fastai."""
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int = 128, **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        # fastai calls tok_func(lang) to build its tokenizer; returning self
        # lets an instance of this class be passed directly as tok_func
        return self

    def tokenizer(self, t: str) -> List[str]:
        """Adds RoBERTa bos/eos tokens and limits the maximum sequence length."""
        sub = 2  # subtract from max_seq_len for the added <s> and </s> tokens
        # `config` is expected to be defined elsewhere in the surrounding script
        if config.mark_fields:
            assert "xxfld" in t
            t = t.replace("xxfld 1", "")  # remove fastai's first field marker
            # convert the remaining fastai field separators to RoBERTa's "</s> </s>"
            fields = re.split(r'xxfld \d+', t)
            res = []
            for field in fields[:-1]:  # every field but the last gets a separator pair
                res += self._pretrained_tokenizer.tokenize(field) + ["</s>", "</s>"]
                sub += 2  # account for the two extra separator tokens we just added
            res += self._pretrained_tokenizer.tokenize(fields[-1])  # add the last field
            return ["<s>"] + res[:self.max_seq_len - sub] + ["</s>"]
        res = self._pretrained_tokenizer.tokenize(t)
        return ["<s>"] + res[:self.max_seq_len - sub] + ["</s>"]