from pathlib import Path
from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer
from tokenizers.processors import RobertaProcessing, BertProcessing
import os
import torch
import logging
from typing import Tuple, List, Dict
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm, trange
import random
import numpy as np
import glob
import re
import shutil
import pickle
# Import from the installed `transformers` package, not a source-tree path
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForMaskedLM,
    BertTokenizer,
    CamembertConfig,
    CamembertForMaskedLM,
    CamembertTokenizer,
    DistilBertConfig,
    DistilBertForMaskedLM,
    DistilBertTokenizer,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    OpenAIGPTConfig,
    OpenAIGPTLMHeadModel,
    OpenAIGPTTokenizer,
    PreTrainedTokenizer,
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaTokenizer,
    get_linear_schedule_with_warmup,
)
from torch.utils.tensorboard import SummaryWriter

logger = logging.getLogger(__name__)

MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, ByteLevelBPETokenizer, None),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, ByteLevelBPETokenizer, None),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer, BertWordPieceTokenizer, BertProcessing),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, ByteLevelBPETokenizer, RobertaProcessing),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer, BertWordPieceTokenizer, BertProcessing),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer, None, None),
}
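
# Each MODEL_CLASSES entry maps a model type to a 5-tuple: (config class,
# LM-head model class, transformers tokenizer class, trainable `tokenizers`
# class for training a tokenizer from scratch (None if unsupported), and the
# post-processor to attach to a freshly trained tokenizer). A minimal lookup
# sketch, not part of the original script:
#
#     config_cls, model_cls, tok_cls, fast_tok_cls, post_proc = MODEL_CLASSES["roberta"]
#     model = model_cls(config=config_cls())  # randomly initialised RobertaForMaskedLM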

class TextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args["model_type"] + "_cached_lm_" + str(block_size) + "_" + filename
        )

        if os.path.exists(cached_features_file) and not args["overwrite_cache"]:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)
            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in blocks of block_size
                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding).
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model-specific) padding.

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)
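
# TextDataset concatenates the whole file and slices it into contiguous
# block_size chunks; LineByLineTextDataset turns each non-empty line into one
# (truncated) example. A minimal usage sketch, assuming a hypothetical
# "train.txt" and an already-built tokenizer (neither is part of this script):
#
#     dataset = LineByLineTextDataset(tokenizer, {}, file_path="train.txt", block_size=512)
#     print(len(dataset), dataset[0])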

class BaseModel(object):
    default_options = {
        "model_type": "roberta",
        "line_by_line": False,
        "should_continue": True,
        "model_name_or_path": None,
        "mlm": True,
        "mlm_probability": 0.15,
        "config_name": None,
        "tokenizer_name": None,
        "cache_dir": None,
        "block_size": -1,
        "evaluate_during_training": False,
        "per_gpu_train_batch_size": 16,
        "per_gpu_eval_batch_size": 16,
        "gradient_accumulation_steps": 4,
        "learning_rate": 5e-5,
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "max_grad_norm": 1.0,
        "num_train_epochs": 1.0,
        "max_steps": -1,
        "warmup_steps": 0,
        "logging_steps": 5,
        "save_steps": 10,
        "save_total_limit": None,
        "eval_all_checkpoints": False,
        "no_cuda": True,
        "overwrite_output_dir": True,
        "overwrite_cache": False,
        "seed": 42,
        "fp16": False,
        "fp16_opt_level": "O1",  # Apex AMP opt level: letter "O" plus a digit ("O0"-"O3"), not zero-one
        "local_rank": -1,
        "server_ip": "",
        "server_port": "",
    }
    def __init__(self, args: Dict, tokenizer: PreTrainedTokenizer):
        self.args = BaseModel.default_options.copy()
        self.args.update(args)

        if not self.args.get("output_dir"):
            raise ValueError("No output directory specified!")

        if self.args["model_type"] in ["bert", "roberta", "distilbert", "camembert"] and not self.args["mlm"]:
            raise ValueError(
                "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run with "
                "mlm=True (masked language modeling)."
            )
        elif self.args["model_type"] in ["openai-gpt", "gpt2"]:
            self.args["mlm"] = False

        if (
            os.path.exists(self.args["output_dir"])
            and os.listdir(self.args["output_dir"])
            and not self.args["overwrite_output_dir"]
        ):
            raise ValueError(
                "Output directory ({}) already exists and is not empty. Set overwrite_output_dir to overwrite it.".format(
                    self.args["output_dir"]
                )
            )

        # Setup distant debugging if needed
        if self.args.get("server_ip") and self.args.get("server_port"):
            # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
            import ptvsd

            print("Waiting for debugger attach")
            ptvsd.enable_attach(address=(self.args["server_ip"], self.args["server_port"]), redirect_output=True)
            ptvsd.wait_for_attach()

        # Setup CUDA, GPU & distributed training
        if self.args["local_rank"] == -1 or self.args["no_cuda"]:
            device = torch.device("cuda" if torch.cuda.is_available() and not self.args["no_cuda"] else "cpu")
            self.args["n_gpu"] = torch.cuda.device_count()
        else:  # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs
            torch.cuda.set_device(self.args["local_rank"])
            device = torch.device("cuda", self.args["local_rank"])
            torch.distributed.init_process_group(backend="nccl")
            self.args["n_gpu"] = 1
        self.args["device"] = device

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            level=logging.INFO if self.args["local_rank"] in [-1, 0] else logging.WARN,
        )
        logger.warning(
            "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
            self.args["local_rank"],
            device,
            self.args["n_gpu"],
            bool(self.args["local_rank"] != -1),
            self.args["fp16"],
        )

        # Set seed
        self._set_seed()

        # Load pretrained model and tokenizer
        if self.args["local_rank"] not in [-1, 0]:
            torch.distributed.barrier()  # Make sure only the first process in distributed training downloads model & vocab

        config_class, model_class, tokenizer_class, _, _ = MODEL_CLASSES[self.args["model_type"]]

        if self.args.get("config_name"):
            config = config_class.from_pretrained(self.args["config_name"], cache_dir=self.args["cache_dir"])
        elif self.args.get("model_name_or_path"):
            config = config_class.from_pretrained(self.args["model_name_or_path"], cache_dir=self.args["cache_dir"])
        else:
            config = config_class()

        if self.args["block_size"] <= 0:
            # Our input block size will be the max possible for the model
            self.args["block_size"] = tokenizer.max_len
        else:
            self.args["block_size"] = min(self.args["block_size"], tokenizer.max_len)

        if self.args.get("model_name_or_path"):
            model = model_class.from_pretrained(
                self.args["model_name_or_path"],
                from_tf=bool(".ckpt" in self.args["model_name_or_path"]),
                config=config,
                cache_dir=self.args["cache_dir"],
            )
        else:
            logger.info("New model needs training from scratch")
            model = model_class(config=config)
        model.to(self.args["device"])

        self.model = model
        self.model_class = model_class
        self.tokenizer = tokenizer
        self.tokenizer_class = tokenizer_class
    def train(self, train_dataset: Dataset, eval_dataset: Dataset = None):
        if self.args["local_rank"] == 0:
            torch.distributed.barrier()  # End of barrier: the first process has downloaded the model & vocab

        logger.info("Training/evaluation parameters %s", self.args)

        # Training
        if self.args["local_rank"] not in [-1, 0]:
            torch.distributed.barrier()  # Make sure only the first process processes the dataset; the others use the cache
        # train_dataset = self._load_and_cache_examples(self.tokenizer, trainfile)
        if self.args["local_rank"] == 0:
            torch.distributed.barrier()

        global_step, tr_loss = self._train(train_dataset, eval_dataset)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
        if self.args["local_rank"] == -1 or torch.distributed.get_rank() == 0:
            # Create output directory if needed
            if self.args["local_rank"] in [-1, 0]:
                os.makedirs(self.args["output_dir"], exist_ok=True)

            logger.info("Saving model checkpoint to %s", self.args["output_dir"])
            # Save a trained model, configuration and tokenizer using `save_pretrained()`.
            # They can then be reloaded using `from_pretrained()`
            model_to_save = (
                self.model.module if hasattr(self.model, "module") else self.model
            )  # Take care of distributed/parallel training
            model_to_save.save_pretrained(self.args["output_dir"])
            self.tokenizer.save_pretrained(self.args["output_dir"])

            # Good practice: save your training arguments together with the trained model
            torch.save(self.args, os.path.join(self.args["output_dir"], "training_args.bin"))

            # Reload the trained model and vocabulary that we have just saved
            self.model = self.model_class.from_pretrained(self.args["output_dir"])
            self.tokenizer = self.tokenizer_class.from_pretrained(self.args["output_dir"])
            self.model.to(self.args["device"])

        # Evaluation
        if eval_dataset and self.args["local_rank"] in [-1, 0]:
            self.evaluate(eval_dataset)
    def evaluate(self, eval_dataset: Dataset):
        results = {}
        checkpoints = [self.args["output_dir"]]
        if self.args["eval_all_checkpoints"]:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(self.args["output_dir"] + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            # _evaluate() runs self.model, so the checkpoint must be loaded into self.model
            self.model = self.model_class.from_pretrained(checkpoint)
            self.model.to(self.args["device"])
            result = self._evaluate(eval_dataset, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
        return results
    def _load_and_cache_examples(self, tokenizer, file_path: str):
        if self.args.get("line_by_line"):
            return LineByLineTextDataset(tokenizer, self.args, file_path=file_path, block_size=self.args["block_size"])
        else:
            return TextDataset(tokenizer, self.args, file_path=file_path, block_size=self.args["block_size"])
    def _sorted_checkpoints(self, output_dir: str, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
        ordering_and_checkpoint_path = []

        glob_checkpoints = glob.glob(os.path.join(output_dir, "{}-*".format(checkpoint_prefix)))

        for path in glob_checkpoints:
            if use_mtime:
                ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
            else:
                regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
                if regex_match and regex_match.groups():
                    ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

        checkpoints_sorted = sorted(ordering_and_checkpoint_path)
        checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
        return checkpoints_sorted
    def _rotate_checkpoints(self, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
        if not self.args.get("save_total_limit"):
            return
        if self.args["save_total_limit"] <= 0:
            return

        # Check if we should delete older checkpoint(s)
        checkpoints_sorted = self._sorted_checkpoints(self.args["output_dir"], checkpoint_prefix, use_mtime)
        if len(checkpoints_sorted) <= self.args["save_total_limit"]:
            return

        number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - self.args["save_total_limit"])
        checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
        for checkpoint in checkpoints_to_be_deleted:
            logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
            shutil.rmtree(checkpoint)
    def _mask_tokens(self, inputs: torch.Tensor, tokenizer: PreTrainedTokenizer) -> Tuple[torch.Tensor, torch.Tensor]:
        """Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original."""
        if tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token, which is necessary for masked language modeling. "
                "Set mlm=False if you want to use this tokenizer."
            )

        labels = inputs.clone()
        # We sample a few tokens in each sequence for masked-LM training (with probability args["mlm_probability"], which defaults to 0.15 as in BERT/RoBERTa)
        probability_matrix = torch.full(labels.shape, self.args["mlm_probability"])
        special_tokens_mask = [
            tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        if tokenizer._pad_token is not None:
            padding_mask = labels.eq(tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with a random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
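        # (0.5 of the ~20% of masked tokens not already replaced by [MASK] = ~10% of all masked tokens)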
        random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
    def _set_seed(self):
        random.seed(self.args["seed"])
        np.random.seed(self.args["seed"])
        torch.manual_seed(self.args["seed"])
        if self.args["n_gpu"] > 0:
            torch.cuda.manual_seed_all(self.args["seed"])
    # def _train_tokenizer(self, vocab_size: int, min_frequency: int):
    #     print("Training " + self.filename)
    #     tokenizer = ByteLevelBPETokenizer()
    #     tokenizer.train(files=[self.filename], vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=[
    #         "<s>", "<pad>", "</s>", "<unk>", "<mask>"
    #     ])
    #     tokenizer.save(".", self.filename + ".vocab")
    #     print("Done.")
    #     return tokenizer
    #
    # def _getTokenizer(self, vocab_size: int = 7000, min_frequency: int = 5):
    #     if os.path.exists(self.filename + ".vocab-merges.txt") and os.path.exists(self.filename + ".vocab-vocab.json"):
    #         tokenizer = ByteLevelBPETokenizer(self.filename + ".vocab-vocab.json", self.filename + ".vocab-merges.txt")
    #     else:
    #         tokenizer = self._train_tokenizer(vocab_size, min_frequency)
    #     tokenizer._tokenizer.post_processor = BertProcessing(
    #         ("</s>", tokenizer.token_to_id("</s>")),
    #         ("<s>", tokenizer.token_to_id("<s>")),
    #     )
    #     tokenizer.enable_truncation(max_length=512)
    #     return tokenizer
    def _train(self, train_dataset: Dataset, eval_dataset: Dataset = None) -> Tuple[int, float]:
        """Train the model."""
        if self.args["local_rank"] in [-1, 0]:
            tb_writer = SummaryWriter()

        self.args["train_batch_size"] = self.args["per_gpu_train_batch_size"] * max(1, self.args["n_gpu"])

        def collate(examples: List[torch.Tensor]):
            if self.tokenizer._pad_token is None:
                return pad_sequence(examples, batch_first=True)
            return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

        train_sampler = RandomSampler(train_dataset) if self.args["local_rank"] == -1 else DistributedSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset, sampler=train_sampler, batch_size=self.args["train_batch_size"], collate_fn=collate
        )

        if self.args["max_steps"] > 0:
            t_total = self.args["max_steps"]
            self.args["num_train_epochs"] = self.args["max_steps"] // (len(train_dataloader) // self.args["gradient_accumulation_steps"]) + 1
        else:
            t_total = len(train_dataloader) // self.args["gradient_accumulation_steps"] * self.args["num_train_epochs"]

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.args["weight_decay"],
            },
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args["learning_rate"], eps=self.args["adam_epsilon"])
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.args["warmup_steps"], num_training_steps=t_total
        )

        # Check if saved optimizer or scheduler states exist
        if (
            self.args.get("model_name_or_path")
            and os.path.isfile(os.path.join(self.args["model_name_or_path"], "optimizer.pt"))
            and os.path.isfile(os.path.join(self.args["model_name_or_path"], "scheduler.pt"))
        ):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(torch.load(os.path.join(self.args["model_name_or_path"], "optimizer.pt")))
            scheduler.load_state_dict(torch.load(os.path.join(self.args["model_name_or_path"], "scheduler.pt")))

        if self.args.get("fp16"):
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=self.args["fp16_opt_level"])

        # multi-gpu training (should be after apex fp16 initialization)
        if self.args["n_gpu"] > 1:
            self.model = torch.nn.DataParallel(self.model)

        # Distributed training (should be after apex fp16 initialization)
        if self.args["local_rank"] != -1:
            self.model = torch.nn.parallel.DistributedDataParallel(
                self.model, device_ids=[self.args["local_rank"]], output_device=self.args["local_rank"], find_unused_parameters=True
            )

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Num Epochs = %d", self.args["num_train_epochs"])
        logger.info("  Instantaneous batch size per GPU = %d", self.args["per_gpu_train_batch_size"])
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            self.args["train_batch_size"]
            * self.args["gradient_accumulation_steps"]
            * (torch.distributed.get_world_size() if self.args["local_rank"] != -1 else 1),
        )
        logger.info("  Gradient Accumulation steps = %d", self.args["gradient_accumulation_steps"])
        logger.info("  Total optimization steps = %d", t_total)

        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        # Check if continuing training from a checkpoint
        if self.args["model_name_or_path"] and os.path.exists(self.args["model_name_or_path"]):
            try:
                # Set global_step to the global_step of the last saved checkpoint from the model path
                checkpoint_suffix = self.args["model_name_or_path"].split("-")[-1].split("/")[0]
                global_step = int(checkpoint_suffix)
                epochs_trained = global_step // (len(train_dataloader) // self.args["gradient_accumulation_steps"])
                steps_trained_in_current_epoch = global_step % (len(train_dataloader) // self.args["gradient_accumulation_steps"])

                logger.info("  Continuing training from checkpoint, will skip to saved global_step")
                logger.info("  Continuing training from epoch %d", epochs_trained)
                logger.info("  Continuing training from global step %d", global_step)
                logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
            except ValueError:
                logger.info("  Starting fine-tuning.")

        tr_loss, logging_loss = 0.0, 0.0

        model_to_resize = self.model.module if hasattr(self.model, "module") else self.model  # Take care of distributed/parallel training
        model_to_resize.resize_token_embeddings(len(self.tokenizer))

        self.model.zero_grad()
        train_iterator = trange(
            epochs_trained, int(self.args["num_train_epochs"]), desc="Epoch", disable=self.args["local_rank"] not in [-1, 0]
        )
        self._set_seed()  # Added here for reproducibility
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args["local_rank"] not in [-1, 0])
            for step, batch in enumerate(epoch_iterator):
                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                inputs, labels = self._mask_tokens(batch, self.tokenizer) if self.args.get("mlm") else (batch, batch)
                inputs = inputs.to(self.args["device"])
                labels = labels.to(self.args["device"])
                self.model.train()
                outputs = self.model(inputs, masked_lm_labels=labels) if self.args.get("mlm") else self.model(inputs, labels=labels)
                loss = outputs[0]  # model outputs are always tuples in transformers (see doc)

                if self.args["n_gpu"] > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if self.args["gradient_accumulation_steps"] > 1:
                    loss = loss / self.args["gradient_accumulation_steps"]

                if self.args.get("fp16"):
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % self.args["gradient_accumulation_steps"] == 0:
                    if self.args.get("fp16"):
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args["max_grad_norm"])
                    else:
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args["max_grad_norm"])
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

                    if self.args["local_rank"] in [-1, 0] and self.args["logging_steps"] > 0 and global_step % self.args["logging_steps"] == 0:
                        # Log metrics
                        if (
                            self.args["local_rank"] == -1 and eval_dataset
                        ):  # Only evaluate on a single GPU, otherwise metrics may not average well
                            results = self.evaluate(eval_dataset)
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                        tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) / self.args["logging_steps"], global_step)
                        logging_loss = tr_loss

                    if self.args["local_rank"] in [-1, 0] and self.args["save_steps"] > 0 and global_step % self.args["save_steps"] == 0:
                        checkpoint_prefix = "checkpoint"
                        # Save model checkpoint
                        output_dir = os.path.join(self.args["output_dir"], "{}-{}".format(checkpoint_prefix, global_step))
                        os.makedirs(output_dir, exist_ok=True)
                        model_to_save = (
                            self.model.module if hasattr(self.model, "module") else self.model
                        )  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        self.tokenizer.save_pretrained(output_dir)

                        torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
                        logger.info("Saving model checkpoint to %s", output_dir)

                        self._rotate_checkpoints(checkpoint_prefix)

                        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        logger.info("Saving optimizer and scheduler states to %s", output_dir)

                if self.args["max_steps"] > 0 and global_step > self.args["max_steps"]:
                    epoch_iterator.close()
                    break
            if self.args["max_steps"] > 0 and global_step > self.args["max_steps"]:
                train_iterator.close()
                break

        if self.args["local_rank"] in [-1, 0]:
            tb_writer.close()

        return global_step, tr_loss / global_step
    def _evaluate(self, dataset: Dataset, prefix="") -> Dict:
        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_output_dir = self.args["output_dir"]
        # eval_dataset = self._load_and_cache_examples(self.tokenizer, file)

        if self.args["local_rank"] in [-1, 0]:
            os.makedirs(eval_output_dir, exist_ok=True)

        self.args["eval_batch_size"] = self.args["per_gpu_eval_batch_size"] * max(1, self.args["n_gpu"])

        # Note that DistributedSampler samples randomly
        def collate(examples: List[torch.Tensor]):
            if self.tokenizer._pad_token is None:
                return pad_sequence(examples, batch_first=True)
            return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(
            dataset, sampler=eval_sampler, batch_size=self.args["eval_batch_size"], collate_fn=collate
        )

        # multi-gpu evaluate
        if self.args["n_gpu"] > 1:
            self.model = torch.nn.DataParallel(self.model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.args["eval_batch_size"])
        eval_loss = 0.0
        nb_eval_steps = 0
        self.model.eval()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            inputs, labels = self._mask_tokens(batch, self.tokenizer) if self.args.get("mlm") else (batch, batch)
            inputs = inputs.to(self.args["device"])
            labels = labels.to(self.args["device"])

            with torch.no_grad():
                outputs = self.model(inputs, masked_lm_labels=labels) if self.args.get("mlm") else self.model(inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
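        # Perplexity is the exponential of the mean cross-entropy loss; with
        # mlm=True the loss is averaged over masked positions only (labels of
        # -100 are ignored by the loss).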
        perplexity = torch.exp(torch.tensor(eval_loss))

        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        return result

# ----------------------------------------------------------------------------------------------------------------------

class ModularModel(BaseModel):
    def __init__(
        self,
        output_dir: str,
        use_tokenizer: PreTrainedTokenizer = None,
        model_type: str = "roberta",
        cache_dir: str = None,
        args=None,
    ):
        if cache_dir is None:
            cache_dir = output_dir + "/cache"
        if args is None:
            args = {}

        # Resume from the most recent checkpoint in output_dir, if one exists
        sorted_checkpoints = self._sorted_checkpoints(output_dir)
        model_name_or_path = None
        if len(sorted_checkpoints) != 0:
            model_name_or_path = sorted_checkpoints[-1]

        _, _, tokenizer_class, _, _ = MODEL_CLASSES[model_type]
        if model_name_or_path:
            tokenizer = tokenizer_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
        elif Path(output_dir + "/tokenizer").exists():
            tokenizer = tokenizer_class.from_pretrained(output_dir + "/tokenizer", cache_dir=cache_dir)
        elif use_tokenizer:
            tokenizer = use_tokenizer
        else:
            raise ValueError(
                "No tokenizer provided, and no pretrained tokenizer available in " + output_dir + "/tokenizer :("
            )

        args.update({
            "model_type": model_type,
            "model_name_or_path": model_name_or_path,
            "output_dir": output_dir + "/out",
            "cache_dir": cache_dir,
        })
        super().__init__(args, tokenizer)

def trainTokenizer(output_dir: str, file: str, model_type: str = "roberta", vocab_size: int = 500, min_frequency: int = 50):
    _, _, tokenizer_class, base_tokenizer_class, processor = MODEL_CLASSES[model_type]
    if base_tokenizer_class is ByteLevelBPETokenizer:
        tokenizer = base_tokenizer_class()
        tokenizer.train(files=[file], vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ])
        if processor:
            tokenizer._tokenizer.post_processor = processor(
                ("</s>", tokenizer.token_to_id("</s>")),
                ("<s>", tokenizer.token_to_id("<s>")),
            )
    elif base_tokenizer_class is BertWordPieceTokenizer:
        tokenizer = base_tokenizer_class()
        tokenizer.train(files=[file], vocab_size=vocab_size, min_frequency=min_frequency)
        tokenizer._tokenizer.post_processor = processor(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
        )
    else:
        raise ValueError("No trainable base tokenizer registered for model type: " + model_type)
    tokenizer.enable_truncation(max_length=512)
    if not os.path.exists(output_dir + "/tokenizer"):
        os.makedirs(output_dir + "/tokenizer")
    # tokenizer.save(directory, name) prefixes the output files with "{name}-";
    # with an empty name they come out as e.g. "-vocab.json", so rename them to
    # the filenames the transformers tokenizer class expects.
    tokenizer.save(output_dir + "/tokenizer", "")
    filenames = [*tokenizer_class.vocab_files_names.values()]
    for f in filenames:
        os.rename(output_dir + "/tokenizer/-" + f, output_dir + "/tokenizer/" + f)
    return tokenizer_class.from_pretrained(output_dir + "/tokenizer/")
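
# A minimal end-to-end sketch of how the pieces above fit together. The corpus
# path and working directory below are hypothetical placeholders, not part of
# the script; `_load_and_cache_examples` is used because the class exposes no
# public data-loading helper.
if __name__ == "__main__":
    corpus = "./data/train.txt"  # hypothetical plain-text corpus
    workdir = "./model"          # hypothetical working directory

    # 1. Train a small byte-level BPE tokenizer; it is saved under workdir/tokenizer.
    tokenizer = trainTokenizer(workdir, corpus, model_type="roberta", vocab_size=500, min_frequency=50)

    # 2. Build a RoBERTa-style model from scratch (no checkpoint exists yet);
    #    the saved tokenizer is picked up from workdir/tokenizer automatically.
    model = ModularModel(workdir, model_type="roberta", args={"block_size": 512, "num_train_epochs": 1.0})

    # 3. Tokenize the corpus and run masked-LM training; checkpoints land in workdir/out.
    train_dataset = model._load_and_cache_examples(model.tokenizer, corpus)
    model.train(train_dataset)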