from pathlib import Path
from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer
from tokenizers.processors import RobertaProcessing, BertProcessing
import os
import torch
import logging
from typing import Tuple, List, Dict
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm, trange
import random
import numpy as np
import glob
import re
import shutil
import pickle

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForMaskedLM,
    BertTokenizer,
    CamembertConfig,
    CamembertForMaskedLM,
    CamembertTokenizer,
    DistilBertConfig,
    DistilBertForMaskedLM,
    DistilBertTokenizer,
    GPT2Config,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    OpenAIGPTConfig,
    OpenAIGPTLMHeadModel,
    OpenAIGPTTokenizer,
    PreTrainedTokenizer,
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaTokenizer,
    get_linear_schedule_with_warmup,
)
from torch.utils.tensorboard import SummaryWriter

logger = logging.getLogger(__name__)

MODEL_CLASSES = {
    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, ByteLevelBPETokenizer, None),
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, ByteLevelBPETokenizer, None),
    "bert": (BertConfig, BertForMaskedLM, BertTokenizer, BertWordPieceTokenizer, BertProcessing),
    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, ByteLevelBPETokenizer, RobertaProcessing),
    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer, BertWordPieceTokenizer, BertProcessing),
    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer, None, None),
}
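
# Illustrative note (not part of the original script): each MODEL_CLASSES entry is a 5-tuple of
# (config class, masked/causal LM class, transformers tokenizer class, `tokenizers` trainer class,
# post-processor class). It is unpacked in BaseModel.__init__ and trainTokenizer below, e.g.:
#
#   config_class, model_class, tokenizer_class, base_tokenizer_class, processor = MODEL_CLASSES["roberta"]
#   model = model_class(config=config_class())  # a randomly initialised RobertaForMaskedLM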

class TextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)

        block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args["model_type"] + "_cached_lm_" + str(block_size) + "_" + filename
        )

        if os.path.exists(cached_features_file) and not args["overwrite_cache"]:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()

            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in blocks of block_size
                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding).
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model-specific) padding.

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)


class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)

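
# Minimal usage sketch (illustration only; "train.txt" and the pretrained tokenizer name are
# assumptions, not part of this paste). Both dataset classes take a transformers tokenizer, an
# args dict, a file path and a block size; TextDataset reads args["model_type"] and
# args["overwrite_cache"], while LineByLineTextDataset ignores args:
#
#   from transformers import RobertaTokenizer
#   tok = RobertaTokenizer.from_pretrained("roberta-base")
#   ds = LineByLineTextDataset(tok, {"model_type": "roberta", "overwrite_cache": False},
#                              file_path="train.txt", block_size=128)
#   print(len(ds), ds[0][:10])  # number of kept lines, first ten token ids of the first line
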
class BaseModel(object):
    default_options = {
        "model_type": "roberta",
        "line_by_line": False,
        "should_continue": True,
        "model_name_or_path": None,
        "mlm": True,
        "mlm_probability": 0.15,
        "config_name": None,
        "tokenizer_name": None,
        "cache_dir": None,
        "block_size": -1,
        "evaluate_during_training": False,
        "per_gpu_train_batch_size": 16,
        "per_gpu_eval_batch_size": 16,
        "gradient_accumulation_steps": 4,
        "learning_rate": 5e-5,
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "max_grad_norm": 1.0,
        "num_train_epochs": 1.0,
        "max_steps": -1,
        "warmup_steps": 0,
        "logging_steps": 5,
        "save_steps": 10,
        "save_total_limit": None,
        "eval_all_checkpoints": False,
        "no_cuda": True,
        "overwrite_output_dir": True,
        "overwrite_cache": False,
        "seed": 42,
        "fp16": False,
        "fp16_opt_level": "O1",  # apex AMP optimization level ("O0"-"O3", letter O)
        "local_rank": -1,
        "server_ip": "",
        "server_port": "",
    }

    def __init__(self, args: Dict, tokenizer: PreTrainedTokenizer):
        self.args = BaseModel.default_options.copy()
        self.args.update(args)

        if not self.args.get("output_dir"):
            raise ValueError("No output directory specified!")

        if self.args["model_type"] in ["bert", "roberta", "distilbert", "camembert"] and not self.args["mlm"]:
            raise ValueError(
                "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run with the "
                "'mlm' option enabled (masked language modeling)."
            )
        elif self.args["model_type"] in ["openai-gpt", "gpt2"]:
            self.args["mlm"] = False

        if (
                os.path.exists(self.args["output_dir"])
                and os.listdir(self.args["output_dir"])
                and not self.args["overwrite_output_dir"]
        ):
            raise ValueError(
                "Output directory ({}) already exists and is not empty. Set 'overwrite_output_dir' to overcome.".format(
                    self.args["output_dir"]
                )
            )

        # Setup distant debugging if needed
        if self.args.get("server_ip") and self.args.get("server_port"):
            # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
            import ptvsd

            print("Waiting for debugger attach")
            ptvsd.enable_attach(address=(self.args["server_ip"], self.args["server_port"]), redirect_output=True)
            ptvsd.wait_for_attach()

        # Setup CUDA, GPU & distributed training
        if self.args["local_rank"] == -1 or self.args["no_cuda"]:
            device = torch.device("cuda" if torch.cuda.is_available() and not self.args["no_cuda"] else "cpu")
            self.args["n_gpu"] = torch.cuda.device_count()
        else:  # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs
            torch.cuda.set_device(self.args["local_rank"])
            device = torch.device("cuda", self.args["local_rank"])
            torch.distributed.init_process_group(backend="nccl")
            self.args["n_gpu"] = 1
        self.args["device"] = device

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            level=logging.INFO if self.args["local_rank"] in [-1, 0] else logging.WARN,
        )
        logger.warning(
            "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
            self.args["local_rank"],
            device,
            self.args["n_gpu"],
            bool(self.args["local_rank"] != -1),
            self.args["fp16"],
        )

        # Set seed
        self._set_seed()

        # Load pretrained model and tokenizer
        if self.args["local_rank"] not in [-1, 0]:
            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training downloads the model & vocab

        config_class, model_class, tokenizer_class, _, _ = MODEL_CLASSES[self.args["model_type"]]

        if self.args.get("config_name"):
            config = config_class.from_pretrained(self.args["config_name"], cache_dir=self.args["cache_dir"])
        elif self.args.get("model_name_or_path"):
            config = config_class.from_pretrained(self.args["model_name_or_path"], cache_dir=self.args["cache_dir"])
        else:
            config = config_class()

        if self.args["block_size"] <= 0:
            self.args["block_size"] = tokenizer.max_len
            # Our input block size will be the max possible for the model
        else:
            self.args["block_size"] = min(self.args["block_size"], tokenizer.max_len)

        if self.args.get("model_name_or_path"):
            model = model_class.from_pretrained(
                self.args["model_name_or_path"],
                from_tf=bool(".ckpt" in self.args["model_name_or_path"]),
                config=config,
                cache_dir=self.args["cache_dir"],
            )
        else:
            logger.info("New model needs training from scratch")
            model = model_class(config=config)

        model.to(self.args["device"])
        self.model = model
        self.model_class = model_class
        self.tokenizer = tokenizer
        self.tokenizer_class = tokenizer_class

    def train(self, train_dataset: Dataset, eval_dataset: Dataset = None):
        if self.args["local_rank"] == 0:
            torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training downloads the model & vocab

        logger.info("Training/evaluation parameters %s", self.args)

        # Training
        if self.args["local_rank"] not in [-1, 0]:
            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training processes the dataset; the others will use the cache

        # train_dataset = self._load_and_cache_examples(self.tokenizer, trainfile)

        if self.args["local_rank"] == 0:
            torch.distributed.barrier()

        global_step, tr_loss = self._train(train_dataset, eval_dataset)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
        if self.args["local_rank"] == -1 or torch.distributed.get_rank() == 0:
            # Create output directory if needed
            if self.args["local_rank"] in [-1, 0]:
                os.makedirs(self.args["output_dir"], exist_ok=True)

            logger.info("Saving model checkpoint to %s", self.args["output_dir"])
            # Save a trained model, configuration and tokenizer using `save_pretrained()`.
            # They can then be reloaded using `from_pretrained()`
            model_to_save = (
                self.model.module if hasattr(self.model, "module") else self.model
            )  # Take care of distributed/parallel training
            model_to_save.save_pretrained(self.args["output_dir"])
            self.tokenizer.save_pretrained(self.args["output_dir"])

            # Good practice: save your training arguments together with the trained model
            torch.save(self.args, os.path.join(self.args["output_dir"], "training_args.bin"))

            # Load a trained model and vocabulary that you have fine-tuned
            self.model = self.model_class.from_pretrained(self.args["output_dir"])
            self.tokenizer = self.tokenizer_class.from_pretrained(self.args["output_dir"])
            self.model.to(self.args["device"])

        # Evaluation
        if eval_dataset and self.args["local_rank"] in [-1, 0]:
            self.evaluate(eval_dataset)

    def evaluate(self, eval_dataset: Dataset):
        results = {}
        checkpoints = [self.args["output_dir"]]
        if self.args["eval_all_checkpoints"]:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(self.args["output_dir"] + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = self.model_class.from_pretrained(checkpoint)
            model.to(self.args["device"])
            result = self._evaluate(eval_dataset, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

        return results

    def _load_and_cache_examples(self, tokenizer, file_path: str):
        if self.args.get("line_by_line"):
            return LineByLineTextDataset(tokenizer, self.args, file_path=file_path, block_size=self.args["block_size"])
        else:
            return TextDataset(tokenizer, self.args, file_path=file_path, block_size=self.args["block_size"])

    def _sorted_checkpoints(self, output_dir: str, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
        ordering_and_checkpoint_path = []

        glob_checkpoints = glob.glob(os.path.join(output_dir, "{}-*".format(checkpoint_prefix)))

        for path in glob_checkpoints:
            if use_mtime:
                ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
            else:
                regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
                if regex_match and regex_match.groups():
                    ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

        checkpoints_sorted = sorted(ordering_and_checkpoint_path)
        checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
        return checkpoints_sorted

    def _rotate_checkpoints(self, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
        if not self.args.get("save_total_limit"):
            return
        if self.args["save_total_limit"] <= 0:
            return

        # Check if we should delete older checkpoint(s)
        checkpoints_sorted = self._sorted_checkpoints(self.args["output_dir"], checkpoint_prefix, use_mtime)
        if len(checkpoints_sorted) <= self.args["save_total_limit"]:
            return

        number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - self.args["save_total_limit"])
        checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
        for checkpoint in checkpoints_to_be_deleted:
            logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
            shutil.rmtree(checkpoint)

    def _mask_tokens(self, inputs: torch.Tensor, tokenizer: PreTrainedTokenizer) -> Tuple[torch.Tensor, torch.Tensor]:
        """Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original."""

        if tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token, which is necessary for masked language modeling. Disable the 'mlm' option if you want to use this tokenizer."
            )

        labels = inputs.clone()
        # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability, defaults to 0.15 in BERT/RoBERTa)
        probability_matrix = torch.full(labels.shape, self.args["mlm_probability"])
        special_tokens_mask = [
            tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        if tokenizer._pad_token is not None:
            padding_mask = labels.eq(tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with a random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels

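    # Worked numbers for _mask_tokens (an illustration added to the original comments): with
    # mlm_probability = 0.15, each non-special token is selected with probability 0.15; of the
    # selected tokens, 80% become the mask token (0.12 of all tokens), half of the remaining 20%
    # get a random id (0.015), and the rest keep their original id (0.015). Labels are -100
    # everywhere except the selected positions, so the loss is computed only on those tokens.
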
    def _set_seed(self):
        random.seed(self.args["seed"])
        np.random.seed(self.args["seed"])
        torch.manual_seed(self.args["seed"])
        if self.args["n_gpu"] > 0:
            torch.cuda.manual_seed_all(self.args["seed"])

#    def _train_tokenizer(self, vocab_size : int, min_frequency : int):
#        print("Training " + self.filename)
#        tokenizer = ByteLevelBPETokenizer()
#        tokenizer.train(files=[self.filename], vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=[
#            "<s>", "<pad>", "</s>", "<unk>", "<mask>"
#        ])
#        tokenizer.save(".", self.filename+".vocab")
#        print("Done.")
#        return tokenizer
#
#    def _getTokenizer(self, vocab_size : int = 7000, min_frequency : int = 5):
#        if (os.path.exists(self.filename+".vocab-merges.txt") & os.path.exists(self.filename+".vocab-vocab.json")):
#            tokenizer = ByteLevelBPETokenizer(self.filename+".vocab-vocab.json", self.filename+".vocab-merges.txt")
#        else:
#            tokenizer = self._train_tokenizer(vocab_size, min_frequency)
#        tokenizer._tokenizer.post_processor = BertProcessing(
#            ("</s>", tokenizer.token_to_id("</s>")),
#            ("<s>", tokenizer.token_to_id("<s>")),
#        )
#        tokenizer.enable_truncation(max_length=512)
#        return tokenizer

    def _train(self, train_dataset: Dataset, eval_dataset: Dataset = None) -> Tuple[int, float]:
        """Train the model."""
        if self.args["local_rank"] in [-1, 0]:
            tb_writer = SummaryWriter()

        self.args["train_batch_size"] = self.args["per_gpu_train_batch_size"] * max(1, self.args["n_gpu"])

        def collate(examples: List[torch.Tensor]):
            if self.tokenizer._pad_token is None:
                return pad_sequence(examples, batch_first=True)
            return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

        train_sampler = RandomSampler(train_dataset) if self.args["local_rank"] == -1 else DistributedSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset, sampler=train_sampler, batch_size=self.args["train_batch_size"], collate_fn=collate
        )

        if self.args["max_steps"] > 0:
            t_total = self.args["max_steps"]
            self.args["num_train_epochs"] = self.args["max_steps"] // (len(train_dataloader) // self.args["gradient_accumulation_steps"]) + 1
        else:
            t_total = len(train_dataloader) // self.args["gradient_accumulation_steps"] * self.args["num_train_epochs"]

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.args["weight_decay"],
            },
            {"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args["learning_rate"], eps=self.args["adam_epsilon"])
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.args["warmup_steps"], num_training_steps=t_total
        )

        # Check if saved optimizer or scheduler states exist
        if (
                self.args.get("model_name_or_path")
                and os.path.isfile(os.path.join(self.args["model_name_or_path"], "optimizer.pt"))
                and os.path.isfile(os.path.join(self.args["model_name_or_path"], "scheduler.pt"))
        ):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(torch.load(os.path.join(self.args["model_name_or_path"], "optimizer.pt")))
            scheduler.load_state_dict(torch.load(os.path.join(self.args["model_name_or_path"], "scheduler.pt")))

        if self.args.get("fp16"):
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=self.args["fp16_opt_level"])

        # multi-gpu training (should be after apex fp16 initialization)
        if self.args["n_gpu"] > 1:
            self.model = torch.nn.DataParallel(self.model)

        # Distributed training (should be after apex fp16 initialization)
        if self.args["local_rank"] != -1:
            self.model = torch.nn.parallel.DistributedDataParallel(
                self.model, device_ids=[self.args["local_rank"]], output_device=self.args["local_rank"], find_unused_parameters=True
            )

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Num Epochs = %d", self.args["num_train_epochs"])
        logger.info("  Instantaneous batch size per GPU = %d", self.args["per_gpu_train_batch_size"])
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            self.args["train_batch_size"]
            * self.args["gradient_accumulation_steps"]
            * (torch.distributed.get_world_size() if self.args["local_rank"] != -1 else 1),
        )
        logger.info("  Gradient Accumulation steps = %d", self.args["gradient_accumulation_steps"])
        logger.info("  Total optimization steps = %d", t_total)

        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        # Check if continuing training from a checkpoint
        if self.args["model_name_or_path"] and os.path.exists(self.args["model_name_or_path"]):
            try:
                # set global_step to the global_step of the last saved checkpoint from the model path
                checkpoint_suffix = self.args["model_name_or_path"].split("-")[-1].split("/")[0]
                global_step = int(checkpoint_suffix)
                epochs_trained = global_step // (len(train_dataloader) // self.args["gradient_accumulation_steps"])
                steps_trained_in_current_epoch = global_step % (len(train_dataloader) // self.args["gradient_accumulation_steps"])

                logger.info("  Continuing training from checkpoint, will skip to saved global_step")
                logger.info("  Continuing training from epoch %d", epochs_trained)
                logger.info("  Continuing training from global step %d", global_step)
                logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
            except ValueError:
                logger.info("  Starting fine-tuning.")

        tr_loss, logging_loss = 0.0, 0.0

        model_to_resize = self.model.module if hasattr(self.model, "module") else self.model  # Take care of distributed/parallel training
        model_to_resize.resize_token_embeddings(len(self.tokenizer))

        self.model.zero_grad()
        train_iterator = trange(
            epochs_trained, int(self.args["num_train_epochs"]), desc="Epoch", disable=self.args["local_rank"] not in [-1, 0]
        )
        self._set_seed()  # Added here for reproducibility
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args["local_rank"] not in [-1, 0])
            for step, batch in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                inputs, labels = self._mask_tokens(batch, self.tokenizer) if self.args.get("mlm") else (batch, batch)
                inputs = inputs.to(self.args["device"])
                labels = labels.to(self.args["device"])
                self.model.train()
                outputs = self.model(inputs, masked_lm_labels=labels) if self.args.get("mlm") else self.model(inputs, labels=labels)
                loss = outputs[0]  # model outputs are always a tuple in transformers (see doc)

                if self.args["n_gpu"] > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if self.args["gradient_accumulation_steps"] > 1:
                    loss = loss / self.args["gradient_accumulation_steps"]

                if self.args.get("fp16"):
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % self.args["gradient_accumulation_steps"] == 0:
                    if self.args.get("fp16"):
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args["max_grad_norm"])
                    else:
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args["max_grad_norm"])
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

                    if self.args["local_rank"] in [-1, 0] and self.args["logging_steps"] > 0 and global_step % self.args["logging_steps"] == 0:
                        # Log metrics
                        if (
                                self.args["local_rank"] == -1 and eval_dataset
                        ):  # Only evaluate on a single GPU, otherwise metrics may not average well
                            results = self.evaluate(eval_dataset)
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                        tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) / self.args["logging_steps"], global_step)
                        logging_loss = tr_loss

                    if self.args["local_rank"] in [-1, 0] and self.args["save_steps"] > 0 and global_step % self.args["save_steps"] == 0:
                        checkpoint_prefix = "checkpoint"
                        # Save model checkpoint
                        output_dir = os.path.join(self.args["output_dir"], "{}-{}".format(checkpoint_prefix, global_step))
                        os.makedirs(output_dir, exist_ok=True)
                        model_to_save = (
                            self.model.module if hasattr(self.model, "module") else self.model
                        )  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        self.tokenizer.save_pretrained(output_dir)

                        torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
                        logger.info("Saving model checkpoint to %s", output_dir)

                        self._rotate_checkpoints(checkpoint_prefix)

                        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        logger.info("Saving optimizer and scheduler states to %s", output_dir)

                if self.args["max_steps"] > 0 and global_step > self.args["max_steps"]:
                    epoch_iterator.close()
                    break
            if self.args["max_steps"] > 0 and global_step > self.args["max_steps"]:
                train_iterator.close()
                break

        if self.args["local_rank"] in [-1, 0]:
            tb_writer.close()

        return global_step, tr_loss / global_step

    def _evaluate(self, dataset: Dataset, prefix="") -> Dict:
        eval_output_dir = self.args["output_dir"]

        # eval_dataset = self._load_and_cache_examples(self.tokenizer, file)

        if self.args["local_rank"] in [-1, 0]:
            os.makedirs(eval_output_dir, exist_ok=True)

        self.args["eval_batch_size"] = self.args["per_gpu_eval_batch_size"] * max(1, self.args["n_gpu"])

        def collate(examples: List[torch.Tensor]):
            if self.tokenizer._pad_token is None:
                return pad_sequence(examples, batch_first=True)
            return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(
            dataset, sampler=eval_sampler, batch_size=self.args["eval_batch_size"], collate_fn=collate
        )

        # multi-gpu evaluate
        if self.args["n_gpu"] > 1:
            self.model = torch.nn.DataParallel(self.model)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.args["eval_batch_size"])
        eval_loss = 0.0
        nb_eval_steps = 0
        self.model.eval()

        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            inputs, labels = self._mask_tokens(batch, self.tokenizer) if self.args.get("mlm") else (batch, batch)
            inputs = inputs.to(self.args["device"])
            labels = labels.to(self.args["device"])

            with torch.no_grad():
                outputs = self.model(inputs, masked_lm_labels=labels) if self.args.get("mlm") else self.model(inputs, labels=labels)
                lm_loss = outputs[0]
                eval_loss += lm_loss.mean().item()
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        perplexity = torch.exp(torch.tensor(eval_loss))

        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        return result

# ----------------------------------------------------------------------------------------------------------------------

class ModularModel(BaseModel):
    def __init__(self,
                 output_dir: str,
                 use_tokenizer: PreTrainedTokenizer = None,
                 model_type: str = "roberta",
                 cache_dir: str = None,
                 args=None
                 ):
        if cache_dir is None:
            cache_dir = output_dir + "/cache"
        if args is None:
            args = {}
        sorted_checkpoints = self._sorted_checkpoints(output_dir)
        model_name_or_path = None
        if len(sorted_checkpoints) != 0:
#                raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
#            else:
            model_name_or_path = sorted_checkpoints[-1]
        _, _, tokenizer_class, _, _ = MODEL_CLASSES[model_type]

        if model_name_or_path:
            tokenizer = tokenizer_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
        elif Path(output_dir + "/tokenizer").exists():
            tokenizer = tokenizer_class.from_pretrained(output_dir + "/tokenizer", cache_dir=cache_dir)
        elif use_tokenizer:
            tokenizer = use_tokenizer
        else:
            raise ValueError(
                "No tokenizer provided, and no pretrained tokenizer available in " + output_dir + "/tokenizer :("
            )
        args.update({
            "model_type": model_type,
            "model_name_or_path": model_name_or_path,
            "output_dir": output_dir + "/out",
            "cache_dir": cache_dir
        })
        super().__init__(args, tokenizer)


def trainTokenizer(output_dir: str, file: str, model_type: str = "roberta", vocab_size: int = 500, min_frequency: int = 50):
    _, _, tokenizer_class, base_tokenizer_class, processor = MODEL_CLASSES[model_type]

    if base_tokenizer_class is ByteLevelBPETokenizer:
        tokenizer = base_tokenizer_class()
        tokenizer.train(files=[file], vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>"
        ])
        if processor:
            tokenizer._tokenizer.post_processor = processor(
                ("</s>", tokenizer.token_to_id("</s>")),
                ("<s>", tokenizer.token_to_id("<s>")),
            )
    elif base_tokenizer_class is BertWordPieceTokenizer:
        tokenizer = base_tokenizer_class()
        tokenizer.train(files=[file], vocab_size=vocab_size, min_frequency=min_frequency)
        tokenizer._tokenizer.post_processor = processor(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
        )
    else:
        raise ValueError("No tokenizer training recipe for model type: " + model_type)

    tokenizer.enable_truncation(max_length=512)
    if not os.path.exists(output_dir + "/tokenizer"):
        os.makedirs(output_dir + "/tokenizer")
    tokenizer.save(output_dir + "/tokenizer", "")

    filenames = [*tokenizer_class.vocab_files_names.values()]

    for f in filenames:
        os.rename(output_dir + "/tokenizer/-" + f, output_dir + "/tokenizer/" + f)

    return tokenizer_class.from_pretrained(output_dir + "/tokenizer/")
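
# End-to-end usage sketch (illustration only; "corpus.txt", "./my_model" and the hyperparameter
# values below are assumptions, not part of this paste): train a small byte-level BPE tokenizer,
# wrap everything in a ModularModel, and run masked-LM training from scratch on CPU (the default
# options set no_cuda=True).
#
#   tokenizer = trainTokenizer("./my_model", "corpus.txt", model_type="roberta",
#                              vocab_size=5000, min_frequency=2)
#   model = ModularModel("./my_model", use_tokenizer=tokenizer, model_type="roberta",
#                        args={"line_by_line": True, "block_size": 128, "num_train_epochs": 1.0})
#   train_ds = model._load_and_cache_examples(model.tokenizer, "corpus.txt")
#   model.train(train_ds)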