import re
from typing import List

from fastai.text import BaseTokenizer
from transformers import RobertaTokenizer  # `pytorch_transformers` in older releases


class FastAiRobertaTokenizer(BaseTokenizer):
    """Wrapper around RobertaTokenizer to be compatible with fastai"""
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int = 128, **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        # fastai's Tokenizer calls tok_func(lang); returning self lets an
        # instance be passed in directly as tok_func
        return self

    def tokenizer(self, t: str) -> List[str]:
        """Adds RoBERTa bos and eos tokens and limits the maximum sequence length"""
        sub = 2  # subtract from max_seq_len for the <s> and </s> special tokens
        if config.mark_fields:  # `config` is assumed to be defined elsewhere in the project
            assert "xxfld" in t
            # convert fastai's field markers into RoBERTa's </s></s> separator
            t = t.replace("xxfld 1", "")  # the first field marker needs no separator
            parts = re.split(r"xxfld \d+", t)
            res = []
            for part in parts[:-1]:  # every field but the last is followed by a separator
                res += self._pretrained_tokenizer.tokenize(part) + ["</s>", "</s>"]
                sub += 2  # each separator pair uses up two more positions
            res += self._pretrained_tokenizer.tokenize(parts[-1])  # add the last field
            return ["<s>"] + res[:self.max_seq_len - sub] + ["</s>"]

        res = self._pretrained_tokenizer.tokenize(t)
        return ["<s>"] + res[:self.max_seq_len - sub] + ["</s>"]