import json
from pytorch_pretrained_bert import OpenAIGPTTokenizer, cached_path

url = "s3://datasets.huggingface.co/personachat/personachat_self_original.json"

# Load the OpenAI GPT tokenizer (assumed here: the snippet below refers to
# a tokenizer that was loaded earlier in the surrounding context)
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

# Download and load JSON dataset
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())

# Tokenize and encode the dataset using our loaded GPT tokenizer,
# recursing through nested dicts and lists down to the strings
def tokenize(obj):
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    if isinstance(obj, dict):
        return dict((n, tokenize(o)) for n, o in obj.items())
    return list(tokenize(o) for o in obj)

dataset = tokenize(dataset)
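
After tokenize has run, every string in the nested JSON has been replaced by a list of integer token ids, with the surrounding dict/list structure preserved. A minimal sanity check, assuming the standard PERSONA-CHAT layout (a "train" and a "valid" split, each dialog carrying "personality" and "utterances" keys):

# Sanity check (sketch; the key names assume the standard PERSONA-CHAT layout)
print(dataset.keys())        # expected: dict_keys(['train', 'valid'])
first_dialog = dataset["train"][0]
print(first_dialog.keys())   # expected: dict_keys(['personality', 'utterances'])
print(first_dialog["personality"][0][:5])  # first token ids of one persona sentence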