from huggingface import ModularModel, trainTokenizer  # proposed high-level API; see note below
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer
import torch
from pathlib import Path


class MyDataset(Dataset):
    def __init__(self, filename: str, tokenizer: PreTrainedTokenizer, block_size: int = 512):
        # Reserve room for the special tokens the tokenizer adds around a single sequence.
        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
        # This example uses a bunch of paragraphs separated by empty lines.
        lines = Path(filename).read_text(encoding="utf-8").split("\n\n")
        uselines = [x for x in lines if x != ""]
        self.examples = tokenizer.batch_encode_plus(
            uselines, add_special_tokens=True, truncation=True, max_length=block_size
        )["input_ids"]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item])


tokenizer = trainTokenizer("./out", "train_data.txt", "roberta")
model = ModularModel("./out", tokenizer, model_type="roberta")
# Once a tokenizer is trained, you can load it instead of retraining:
# model = ModularModel("./out", args={})
dataset = MyDataset("train_data.txt", model.tokenizer)
model.train(dataset, eval_dataset=None)
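
Note: the ModularModel / trainTokenizer wrapper above appears to be a proposed convenience API rather than part of the released libraries. For comparison, here is a minimal sketch of the same workflow (train a byte-level BPE tokenizer, then train a RoBERTa masked LM from scratch) using the released tokenizers and transformers packages; the vocabulary size, file names, and training arguments here are placeholder assumptions, not values from the original paste.

from tokenizers import ByteLevelBPETokenizer
from transformers import (DataCollatorForLanguageModeling, RobertaConfig,
                          RobertaForMaskedLM, RobertaTokenizerFast, Trainer,
                          TrainingArguments)

# Train a byte-level BPE tokenizer on the raw text and write vocab.json /
# merges.txt into ./out (vocab size and paths are assumptions).
bpe = ByteLevelBPETokenizer()
bpe.train(files=["train_data.txt"], vocab_size=30_000, min_frequency=2,
          special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
bpe.save_model("./out")

tokenizer = RobertaTokenizerFast.from_pretrained("./out")
model = RobertaForMaskedLM(RobertaConfig(vocab_size=30_000))

# Reuse the MyDataset class defined above; the collator pads each batch and
# applies random masking for the masked-LM objective.
dataset = MyDataset("train_data.txt", tokenizer)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True,
                                           mlm_probability=0.15)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="./out", num_train_epochs=1),
    data_collator=collator,
    train_dataset=dataset,
)
trainer.train()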