from pathlib import Path
from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer
from tokenizers.processors import RobertaProcessing, BertProcessing
import os
import torch
import logging
from typing import Tuple, List, Dict
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm, trange
import random
import numpy as np
import glob
import re
import shutil
import pickle
# Import from the installed `transformers` package, not a source-tree path
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForMaskedLM,
    BertTokenizer,
    CamembertConfig,
    CamembertForMaskedLM,
    CamembertTokenizer,
    DistilBertConfig,
    DistilBertForMaskedLM,
    DistilBertTokenizer,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    OpenAIGPTConfig,
    OpenAIGPTLMHeadModel,
    OpenAIGPTTokenizer,
    PreTrainedTokenizer,
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaTokenizer,
    get_linear_schedule_with_warmup,
)
from torch.utils.tensorboard import SummaryWriter

logger = logging.getLogger(__name__)

MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, ByteLevelBPETokenizer, None),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, ByteLevelBPETokenizer, None),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer, BertWordPieceTokenizer, BertProcessing),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, ByteLevelBPETokenizer, RobertaProcessing),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer, BertWordPieceTokenizer, BertProcessing),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer, None, None),
}
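
# Each MODEL_CLASSES entry maps a model type to a 5-tuple: (config class,
# LM-head model class, transformers tokenizer class, trainable `tokenizers`
# class for training a tokenizer from scratch (None if unsupported), and the
# post-processor to attach to a freshly trained tokenizer). A minimal lookup
# sketch, not part of the original script:
#
#     config_cls, model_cls, tok_cls, fast_tok_cls, post_proc = MODEL_CLASSES["roberta"]
#     model = model_cls(config=config_cls())  # randomly initialised RobertaForMaskedLM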

class TextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args["model_type"] + "_cached_lm_" + str(block_size) + "_" + filename
        )

        if os.path.exists(cached_features_file) and not args["overwrite_cache"]:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)
            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in blocks of block_size
                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding).
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model-specific) padding.

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)
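
# TextDataset concatenates the whole file and slices it into contiguous
# block_size chunks; LineByLineTextDataset turns each non-empty line into one
# (truncated) example. A minimal usage sketch, assuming a hypothetical
# "train.txt" and an already-built tokenizer (neither is part of this script):
#
#     dataset = LineByLineTextDataset(tokenizer, {}, file_path="train.txt", block_size=512)
#     print(len(dataset), dataset[0])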

class BaseModel(object):
    default_options = {
        "model_type": "roberta",
        "line_by_line": False,
        "should_continue": True,
        "model_name_or_path": None,
        "mlm": True,
        "mlm_probability": 0.15,
        "config_name": None,
        "tokenizer_name": None,
        "cache_dir": None,
        "block_size": -1,
        "evaluate_during_training": False,
        "per_gpu_train_batch_size": 16,
        "per_gpu_eval_batch_size": 16,
        "gradient_accumulation_steps": 4,
        "learning_rate": 5e-5,
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "max_grad_norm": 1.0,
        "num_train_epochs": 1.0,
        "max_steps": -1,
        "warmup_steps": 0,
        "logging_steps": 5,
        "save_steps": 10,
        "save_total_limit": None,
        "eval_all_checkpoints": False,
        "no_cuda": True,
        "overwrite_output_dir": True,
        "overwrite_cache": False,
        "seed": 42,
        "fp16": False,
        "fp16_opt_level": "O1",  # Apex AMP opt level: letter "O" plus a digit ("O0"-"O3"), not zero-one
        "local_rank": -1,
        "server_ip": "",
        "server_port": "",
    }
    def __init__(self, args: Dict, tokenizer: PreTrainedTokenizer):
        self.args = BaseModel.default_options.copy()
        self.args.update(args)

        if not self.args.get("output_dir"):
            raise ValueError("No output directory specified!")

        if self.args["model_type"] in ["bert", "roberta", "distilbert", "camembert"] and not self.args["mlm"]:
            raise ValueError(
                "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run with "
                "mlm=True (masked language modeling)."
            )
        elif self.args["model_type"] in ["openai-gpt", "gpt2"]:
            self.args["mlm"] = False

        if (
            os.path.exists(self.args["output_dir"])
            and os.listdir(self.args["output_dir"])
            and not self.args["overwrite_output_dir"]
        ):
            raise ValueError(
                "Output directory ({}) already exists and is not empty. Set overwrite_output_dir to overwrite it.".format(
                    self.args["output_dir"]
                )
            )

        # Setup distant debugging if needed
        if self.args.get("server_ip") and self.args.get("server_port"):
            # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
            import ptvsd

            print("Waiting for debugger attach")
            ptvsd.enable_attach(address=(self.args["server_ip"], self.args["server_port"]), redirect_output=True)
            ptvsd.wait_for_attach()

        # Setup CUDA, GPU & distributed training
        if self.args["local_rank"] == -1 or self.args["no_cuda"]:
            device = torch.device("cuda" if torch.cuda.is_available() and not self.args["no_cuda"] else "cpu")
            self.args["n_gpu"] = torch.cuda.device_count()
        else:  # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs
            torch.cuda.set_device(self.args["local_rank"])
            device = torch.device("cuda", self.args["local_rank"])
            torch.distributed.init_process_group(backend="nccl")
            self.args["n_gpu"] = 1
        self.args["device"] = device

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            level=logging.INFO if self.args["local_rank"] in [-1, 0] else logging.WARN,
        )
        logger.warning(
            "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
            self.args["local_rank"],
            device,
            self.args["n_gpu"],
            bool(self.args["local_rank"] != -1),
            self.args["fp16"],
        )

        # Set seed
        self._set_seed()

        # Load pretrained model and tokenizer
        if self.args["local_rank"] not in [-1, 0]:
            torch.distributed.barrier()  # Make sure only the first process in distributed training downloads model & vocab

        config_class, model_class, tokenizer_class, _, _ = MODEL_CLASSES[self.args["model_type"]]

        if self.args.get("config_name"):
            config = config_class.from_pretrained(self.args["config_name"], cache_dir=self.args["cache_dir"])
        elif self.args.get("model_name_or_path"):
            config = config_class.from_pretrained(self.args["model_name_or_path"], cache_dir=self.args["cache_dir"])
        else:
            config = config_class()

        if self.args["block_size"] <= 0:
            # Our input block size will be the max possible for the model
            self.args["block_size"] = tokenizer.max_len
        else:
            self.args["block_size"] = min(self.args["block_size"], tokenizer.max_len)

        if self.args.get("model_name_or_path"):
            model = model_class.from_pretrained(
                self.args["model_name_or_path"],
                from_tf=bool(".ckpt" in self.args["model_name_or_path"]),
                config=config,
                cache_dir=self.args["cache_dir"],
            )
        else:
            logger.info("New model needs training from scratch")
            model = model_class(config=config)
        model.to(self.args["device"])

        self.model = model
        self.model_class = model_class
        self.tokenizer = tokenizer
        self.tokenizer_class = tokenizer_class
    def train(self, train_dataset: Dataset, eval_dataset: Dataset = None):
        if self.args["local_rank"] == 0:
            torch.distributed.barrier()  # End of barrier: the first process has downloaded the model & vocab

        logger.info("Training/evaluation parameters %s", self.args)

        # Training
        if self.args["local_rank"] not in [-1, 0]:
            torch.distributed.barrier()  # Make sure only the first process processes the dataset; the others use the cache
        # train_dataset = self._load_and_cache_examples(self.tokenizer, trainfile)
        if self.args["local_rank"] == 0:
            torch.distributed.barrier()

        global_step, tr_loss = self._train(train_dataset, eval_dataset)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
        if self.args["local_rank"] == -1 or torch.distributed.get_rank() == 0:
            # Create output directory if needed
            if self.args["local_rank"] in [-1, 0]:
                os.makedirs(self.args["output_dir"], exist_ok=True)

            logger.info("Saving model checkpoint to %s", self.args["output_dir"])
            # Save a trained model, configuration and tokenizer using `save_pretrained()`.
            # They can then be reloaded using `from_pretrained()`
            model_to_save = (
                self.model.module if hasattr(self.model, "module") else self.model
            )  # Take care of distributed/parallel training
            model_to_save.save_pretrained(self.args["output_dir"])
            self.tokenizer.save_pretrained(self.args["output_dir"])

            # Good practice: save your training arguments together with the trained model
            torch.save(self.args, os.path.join(self.args["output_dir"], "training_args.bin"))

            # Reload the trained model and vocabulary that we have just saved
            self.model = self.model_class.from_pretrained(self.args["output_dir"])
            self.tokenizer = self.tokenizer_class.from_pretrained(self.args["output_dir"])
            self.model.to(self.args["device"])

        # Evaluation
        if eval_dataset and self.args["local_rank"] in [-1, 0]:
            self.evaluate(eval_dataset)
    def evaluate(self, eval_dataset: Dataset):
        results = {}
        checkpoints = [self.args["output_dir"]]
        if self.args["eval_all_checkpoints"]:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(self.args["output_dir"] + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            # _evaluate() runs self.model, so the checkpoint must be loaded into self.model
            self.model = self.model_class.from_pretrained(checkpoint)
            self.model.to(self.args["device"])
            result = self._evaluate(eval_dataset, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
        return results
    def _load_and_cache_examples(self, tokenizer, file_path: str):
        if self.args.get("line_by_line"):
            return LineByLineTextDataset(tokenizer, self.args, file_path=file_path, block_size=self.args["block_size"])
        else:
            return TextDataset(tokenizer, self.args, file_path=file_path, block_size=self.args["block_size"])
    def _sorted_checkpoints(self, output_dir: str, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
        ordering_and_checkpoint_path = []

        glob_checkpoints = glob.glob(os.path.join(output_dir, "{}-*".format(checkpoint_prefix)))

        for path in glob_checkpoints:
            if use_mtime:
                ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
            else:
                regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
                if regex_match and regex_match.groups():
                    ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

        checkpoints_sorted = sorted(ordering_and_checkpoint_path)
        checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
        return checkpoints_sorted
    def _rotate_checkpoints(self, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
        if not self.args.get("save_total_limit"):
            return
        if self.args["save_total_limit"] <= 0:
            return

        # Check if we should delete older checkpoint(s)
        checkpoints_sorted = self._sorted_checkpoints(self.args["output_dir"], checkpoint_prefix, use_mtime)
        if len(checkpoints_sorted) <= self.args["save_total_limit"]:
            return

        number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - self.args["save_total_limit"])
        checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
        for checkpoint in checkpoints_to_be_deleted:
            logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
            shutil.rmtree(checkpoint)
    def _mask_tokens(self, inputs: torch.Tensor, tokenizer: PreTrainedTokenizer) -> Tuple[torch.Tensor, torch.Tensor]:
        """Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original."""
        if tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token, which is necessary for masked language modeling. "
                "Set mlm=False if you want to use this tokenizer."
            )

        labels = inputs.clone()
        # We sample a few tokens in each sequence for masked-LM training (with probability args["mlm_probability"], which defaults to 0.15 as in BERT/RoBERTa)
        probability_matrix = torch.full(labels.shape, self.args["mlm_probability"])
        special_tokens_mask = [
            tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        if tokenizer._pad_token is not None:
            padding_mask = labels.eq(tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with a random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
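        # (0.5 of the ~20% of masked tokens not already replaced by [MASK] = ~10% of all masked tokens)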
        random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
    def _set_seed(self):
        random.seed(self.args["seed"])
        np.random.seed(self.args["seed"])
        torch.manual_seed(self.args["seed"])
        if self.args["n_gpu"] > 0:
            torch.cuda.manual_seed_all(self.args["seed"])
    # def _train_tokenizer(self, vocab_size: int, min_frequency: int):
    #     print("Training " + self.filename)
    #     tokenizer = ByteLevelBPETokenizer()
    #     tokenizer.train(files=[self.filename], vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=[
    #         "<s>", "<pad>", "</s>", "<unk>", "<mask>"
    #     ])
    #     tokenizer.save(".", self.filename + ".vocab")
    #     print("Done.")
    #     return tokenizer
    #
    # def _getTokenizer(self, vocab_size: int = 7000, min_frequency: int = 5):
    #     if os.path.exists(self.filename + ".vocab-merges.txt") and os.path.exists(self.filename + ".vocab-vocab.json"):
    #         tokenizer = ByteLevelBPETokenizer(self.filename + ".vocab-vocab.json", self.filename + ".vocab-merges.txt")
    #     else:
    #         tokenizer = self._train_tokenizer(vocab_size, min_frequency)
    #     tokenizer._tokenizer.post_processor = BertProcessing(
    #         ("</s>", tokenizer.token_to_id("</s>")),
    #         ("<s>", tokenizer.token_to_id("<s>")),
    #     )
    #     tokenizer.enable_truncation(max_length=512)
    #     return tokenizer
    def _train(self, train_dataset: Dataset, eval_dataset: Dataset = None) -> Tuple[int, float]:
        """Train the model."""
        if self.args["local_rank"] in [-1, 0]:
            tb_writer = SummaryWriter()

        self.args["train_batch_size"] = self.args["per_gpu_train_batch_size"] * max(1, self.args["n_gpu"])

        def collate(examples: List[torch.Tensor]):
            if self.tokenizer._pad_token is None:
                return pad_sequence(examples, batch_first=True)
            return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

        train_sampler = RandomSampler(train_dataset) if self.args["local_rank"] == -1 else DistributedSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset, sampler=train_sampler, batch_size=self.args["train_batch_size"], collate_fn=collate
        )

        if self.args["max_steps"] > 0:
            t_total = self.args["max_steps"]
            self.args["num_train_epochs"] = self.args["max_steps"] // (len(train_dataloader) // self.args["gradient_accumulation_steps"]) + 1
        else:
            t_total = len(train_dataloader) // self.args["gradient_accumulation_steps"] * self.args["num_train_epochs"]

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.args["weight_decay"],
            },
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args["learning_rate"], eps=self.args["adam_epsilon"])
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.args["warmup_steps"], num_training_steps=t_total
        )

        # Check if saved optimizer or scheduler states exist
        if (
            self.args.get("model_name_or_path")
            and os.path.isfile(os.path.join(self.args["model_name_or_path"], "optimizer.pt"))
            and os.path.isfile(os.path.join(self.args["model_name_or_path"], "scheduler.pt"))
        ):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(torch.load(os.path.join(self.args["model_name_or_path"], "optimizer.pt")))
            scheduler.load_state_dict(torch.load(os.path.join(self.args["model_name_or_path"], "scheduler.pt")))

        if self.args.get("fp16"):
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=self.args["fp16_opt_level"])

        # multi-gpu training (should be after apex fp16 initialization)
        if self.args["n_gpu"] > 1:
            self.model = torch.nn.DataParallel(self.model)

        # Distributed training (should be after apex fp16 initialization)
        if self.args["local_rank"] != -1:
            self.model = torch.nn.parallel.DistributedDataParallel(
                self.model, device_ids=[self.args["local_rank"]], output_device=self.args["local_rank"], find_unused_parameters=True
            )

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Num Epochs = %d", self.args["num_train_epochs"])
        logger.info("  Instantaneous batch size per GPU = %d", self.args["per_gpu_train_batch_size"])
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            self.args["train_batch_size"]
            * self.args["gradient_accumulation_steps"]
            * (torch.distributed.get_world_size() if self.args["local_rank"] != -1 else 1),
        )
        logger.info("  Gradient Accumulation steps = %d", self.args["gradient_accumulation_steps"])
        logger.info("  Total optimization steps = %d", t_total)

        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        # Check if continuing training from a checkpoint
        if self.args["model_name_or_path"] and os.path.exists(self.args["model_name_or_path"]):
            try:
                # Set global_step to the global_step of the last saved checkpoint from the model path
                checkpoint_suffix = self.args["model_name_or_path"].split("-")[-1].split("/")[0]
                global_step = int(checkpoint_suffix)
                epochs_trained = global_step // (len(train_dataloader) // self.args["gradient_accumulation_steps"])
                steps_trained_in_current_epoch = global_step % (len(train_dataloader) // self.args["gradient_accumulation_steps"])

                logger.info("  Continuing training from checkpoint, will skip to saved global_step")
                logger.info("  Continuing training from epoch %d", epochs_trained)
                logger.info("  Continuing training from global step %d", global_step)
                logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
            except ValueError:
                logger.info("  Starting fine-tuning.")

        tr_loss, logging_loss = 0.0, 0.0

        model_to_resize = self.model.module if hasattr(self.model, "module") else self.model  # Take care of distributed/parallel training
        model_to_resize.resize_token_embeddings(len(self.tokenizer))

        self.model.zero_grad()
        train_iterator = trange(
            epochs_trained, int(self.args["num_train_epochs"]), desc="Epoch", disable=self.args["local_rank"] not in [-1, 0]
        )
        self._set_seed()  # Added here for reproducibility
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args["local_rank"] not in [-1, 0])
            for step, batch in enumerate(epoch_iterator):
                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                inputs, labels = self._mask_tokens(batch, self.tokenizer) if self.args.get("mlm") else (batch, batch)
                inputs = inputs.to(self.args["device"])
                labels = labels.to(self.args["device"])
                self.model.train()
                outputs = self.model(inputs, masked_lm_labels=labels) if self.args.get("mlm") else self.model(inputs, labels=labels)
                loss = outputs[0]  # model outputs are always tuples in transformers (see doc)

                if self.args["n_gpu"] > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if self.args["gradient_accumulation_steps"] > 1:
                    loss = loss / self.args["gradient_accumulation_steps"]

                if self.args.get("fp16"):
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % self.args["gradient_accumulation_steps"] == 0:
                    if self.args.get("fp16"):
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args["max_grad_norm"])
                    else:
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args["max_grad_norm"])
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

                    if self.args["local_rank"] in [-1, 0] and self.args["logging_steps"] > 0 and global_step % self.args["logging_steps"] == 0:
                        # Log metrics
                        if (
                            self.args["local_rank"] == -1 and eval_dataset
                        ):  # Only evaluate on a single GPU, otherwise metrics may not average well
                            results = self.evaluate(eval_dataset)
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                        tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) / self.args["logging_steps"], global_step)
                        logging_loss = tr_loss

                    if self.args["local_rank"] in [-1, 0] and self.args["save_steps"] > 0 and global_step % self.args["save_steps"] == 0:
                        checkpoint_prefix = "checkpoint"
                        # Save model checkpoint
                        output_dir = os.path.join(self.args["output_dir"], "{}-{}".format(checkpoint_prefix, global_step))
                        os.makedirs(output_dir, exist_ok=True)
                        model_to_save = (
                            self.model.module if hasattr(self.model, "module") else self.model
                        )  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        self.tokenizer.save_pretrained(output_dir)

                        torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
                        logger.info("Saving model checkpoint to %s", output_dir)

                        self._rotate_checkpoints(checkpoint_prefix)

                        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        logger.info("Saving optimizer and scheduler states to %s", output_dir)

                if self.args["max_steps"] > 0 and global_step > self.args["max_steps"]:
                    epoch_iterator.close()
                    break
            if self.args["max_steps"] > 0 and global_step > self.args["max_steps"]:
                train_iterator.close()
                break

        if self.args["local_rank"] in [-1, 0]:
            tb_writer.close()

        return global_step, tr_loss / global_step
    def _evaluate(self, dataset: Dataset, prefix="") -> Dict:
        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_output_dir = self.args["output_dir"]
        # eval_dataset = self._load_and_cache_examples(self.tokenizer, file)

        if self.args["local_rank"] in [-1, 0]:
            os.makedirs(eval_output_dir, exist_ok=True)

        self.args["eval_batch_size"] = self.args["per_gpu_eval_batch_size"] * max(1, self.args["n_gpu"])

        # Note that DistributedSampler samples randomly
        def collate(examples: List[torch.Tensor]):
            if self.tokenizer._pad_token is None:
                return pad_sequence(examples, batch_first=True)
            return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(
            dataset, sampler=eval_sampler, batch_size=self.args["eval_batch_size"], collate_fn=collate
        )

        # multi-gpu evaluate
        if self.args["n_gpu"] > 1:
            self.model = torch.nn.DataParallel(self.model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.args["eval_batch_size"])
        eval_loss = 0.0
        nb_eval_steps = 0
        self.model.eval()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            inputs, labels = self._mask_tokens(batch, self.tokenizer) if self.args.get("mlm") else (batch, batch)
            inputs = inputs.to(self.args["device"])
            labels = labels.to(self.args["device"])

            with torch.no_grad():
                outputs = self.model(inputs, masked_lm_labels=labels) if self.args.get("mlm") else self.model(inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
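        # Perplexity is the exponential of the mean cross-entropy loss; with
        # mlm=True the loss is averaged over masked positions only (labels of
        # -100 are ignored by the loss).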
        perplexity = torch.exp(torch.tensor(eval_loss))

        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        return result

# ----------------------------------------------------------------------------------------------------------------------

class ModularModel(BaseModel):
    def __init__(
        self,
        output_dir: str,
        use_tokenizer: PreTrainedTokenizer = None,
        model_type: str = "roberta",
        cache_dir: str = None,
        args=None,
    ):
        if cache_dir is None:
            cache_dir = output_dir + "/cache"
        if args is None:
            args = {}

        # Resume from the most recent checkpoint in output_dir, if one exists
        sorted_checkpoints = self._sorted_checkpoints(output_dir)
        model_name_or_path = None
        if len(sorted_checkpoints) != 0:
            model_name_or_path = sorted_checkpoints[-1]

        _, _, tokenizer_class, _, _ = MODEL_CLASSES[model_type]
        if model_name_or_path:
            tokenizer = tokenizer_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
        elif Path(output_dir + "/tokenizer").exists():
            tokenizer = tokenizer_class.from_pretrained(output_dir + "/tokenizer", cache_dir=cache_dir)
        elif use_tokenizer:
            tokenizer = use_tokenizer
        else:
            raise ValueError(
                "No tokenizer provided, and no pretrained tokenizer available in " + output_dir + "/tokenizer :("
            )

        args.update({
            "model_type": model_type,
            "model_name_or_path": model_name_or_path,
            "output_dir": output_dir + "/out",
            "cache_dir": cache_dir,
        })
        super().__init__(args, tokenizer)

def trainTokenizer(output_dir: str, file: str, model_type: str = "roberta", vocab_size: int = 500, min_frequency: int = 50):
    _, _, tokenizer_class, base_tokenizer_class, processor = MODEL_CLASSES[model_type]
    if base_tokenizer_class is ByteLevelBPETokenizer:
        tokenizer = base_tokenizer_class()
        tokenizer.train(files=[file], vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ])
        if processor:
            tokenizer._tokenizer.post_processor = processor(
                ("</s>", tokenizer.token_to_id("</s>")),
                ("<s>", tokenizer.token_to_id("<s>")),
            )
    elif base_tokenizer_class is BertWordPieceTokenizer:
        tokenizer = base_tokenizer_class()
        tokenizer.train(files=[file], vocab_size=vocab_size, min_frequency=min_frequency)
        tokenizer._tokenizer.post_processor = processor(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
        )
    else:
        raise ValueError("No trainable base tokenizer registered for model type: " + model_type)
    tokenizer.enable_truncation(max_length=512)
    if not os.path.exists(output_dir + "/tokenizer"):
        os.makedirs(output_dir + "/tokenizer")
    # tokenizer.save(directory, name) prefixes the output files with "{name}-";
    # with an empty name they come out as e.g. "-vocab.json", so rename them to
    # the filenames the transformers tokenizer class expects.
    tokenizer.save(output_dir + "/tokenizer", "")
    filenames = [*tokenizer_class.vocab_files_names.values()]
    for f in filenames:
        os.rename(output_dir + "/tokenizer/-" + f, output_dir + "/tokenizer/" + f)
    return tokenizer_class.from_pretrained(output_dir + "/tokenizer/")
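
# A minimal end-to-end sketch of how the pieces above fit together. The corpus
# path and working directory below are hypothetical placeholders, not part of
# the script; `_load_and_cache_examples` is used because the class exposes no
# public data-loading helper.
if __name__ == "__main__":
    corpus = "./data/train.txt"  # hypothetical plain-text corpus
    workdir = "./model"          # hypothetical working directory

    # 1. Train a small byte-level BPE tokenizer; it is saved under workdir/tokenizer.
    tokenizer = trainTokenizer(workdir, corpus, model_type="roberta", vocab_size=500, min_frequency=50)

    # 2. Build a RoBERTa-style model from scratch (no checkpoint exists yet);
    #    the saved tokenizer is picked up from workdir/tokenizer automatically.
    model = ModularModel(workdir, model_type="roberta", args={"block_size": 512, "num_train_epochs": 1.0})

    # 3. Tokenize the corpus and run masked-LM training; checkpoints land in workdir/out.
    train_dataset = model._load_and_cache_examples(model.tokenizer, corpus)
    model.train(train_dataset)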