from huggingface import ModularModel, trainTokenizer
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer
import torch
from pathlib import Path

class MyDataset(Dataset):
    def __init__(self, filename: str, tokenizer: PreTrainedTokenizer, block_size: int = 512):
        # reserve room for the special tokens the tokenizer adds to each sequence
        # (newer transformers versions renamed max_len to model_max_length and
        # expose this difference directly as tokenizer.num_special_tokens_to_add())
        block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)
        # this example uses a bunch of paragraphs separated by empty lines
        # (see the sample layout after this class)
        lines = Path(filename).read_text(encoding="utf-8").split("\n\n")
        uselines = [x for x in lines if x != ""]
        self.examples = tokenizer.batch_encode_plus(
            uselines, add_special_tokens=True, max_length=block_size
        )["input_ids"]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item])

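# For reference, the layout that the split("\n\n") above expects from
# train_data.txt (the actual content is whatever you want to train on):
#
#     First paragraph of training text ...
#
#     Second paragraph, separated from the first by an empty line ...
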
tokenizer = trainTokenizer("./out", "train_data.txt", "roberta")
model = ModularModel("./out", tokenizer, model_type="roberta")

# Once a tokenizer is trained, you can use this one instead:
# model = ModularModel("./out", args={})

dataset = MyDataset("train_data.txt", model.tokenizer)

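# MyDataset returns variable-length 1-D tensors, so batching it yourself
# (outside model.train) needs a padding collate_fn. A minimal sketch, not
# part of the original snippet, assuming the trained tokenizer defines
# pad_token_id:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate(batch):
    # right-pad every example in the batch to the length of the longest one
    return pad_sequence(batch, batch_first=True,
                        padding_value=model.tokenizer.pad_token_id)

loader = DataLoader(dataset, batch_size=8, collate_fn=collate)
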
model.train(dataset, eval_dataset=None)
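
# For reference, a rough equivalent using only the public `tokenizers` and
# `transformers` packages -- ModularModel/trainTokenizer above come from a
# separate wrapper module, not from `transformers` itself. This is a minimal
# sketch under that assumption, not the wrapper's actual implementation, and
# the exact class and argument names depend on your installed versions.
def train_with_stock_transformers():
    from tokenizers import ByteLevelBPETokenizer
    from transformers import (
        RobertaConfig,
        RobertaForMaskedLM,
        RobertaTokenizerFast,
        DataCollatorForLanguageModeling,
        Trainer,
        TrainingArguments,
    )

    # 1. Train a byte-level BPE tokenizer on the raw text file.
    bpe = ByteLevelBPETokenizer()
    bpe.train(files=["train_data.txt"], vocab_size=30_000,
              special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    bpe.save_model("./out")

    # 2. Reload it as a transformers tokenizer and build a fresh RoBERTa.
    tok = RobertaTokenizerFast.from_pretrained("./out")
    config = RobertaConfig(vocab_size=tok.vocab_size,
                           max_position_embeddings=514)  # RoBERTa reserves 2 extra positions
    mlm_model = RobertaForMaskedLM(config)

    # 3. The collator pads each batch and applies the MLM token masking.
    collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=True)
    trainer = Trainer(
        model=mlm_model,
        args=TrainingArguments(output_dir="./out"),
        data_collator=collator,
        train_dataset=MyDataset("train_data.txt", tok),
    )
    trainer.train()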