- !pip install -q git+https://github.com/rwightman/pytorch-image-models.git
- !pip install -q torchsummary
- !pip install -q -U git+https://github.com/albu/albumentations --no-cache-dir
- !pip install -q neptune-client
- from IPython.display import clear_output
- clear_output()
- #import torch.nn as nn
- import torch.nn.init as init
- import sys
- import numpy as np
- import torch
- from torch.nn.parameter import Parameter
- import math
- import os
- from torchsummary import summary
- import warnings
- import random
- from matplotlib import pyplot as plt
- import seaborn as sns
- from typing import *
- import albumentations
- from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
- import cv2
- import neptune.new as neptune
- import pandas as pd
- import timm
- import torch.nn.functional as F
- from albumentations.pytorch.transforms import ToTensorV2
- from sklearn.preprocessing import LabelEncoder
- from torch import nn
- from torch.autograd import Variable
- from torch.optim.lr_scheduler import _LRScheduler
- from torch.optim.optimizer import Optimizer
- from torchvision import models
- from tqdm.notebook import tqdm
- warnings.filterwarnings("ignore")
- from torch.utils.data import DataLoader, Dataset
- from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
- from IPython.display import clear_output
- clear_output()
- torch.cuda.empty_cache()
- import albumentations as A
- size = 512
- bs = 96
- #efficientnet_b0 nfnet_l0
- CONFIG = {
- "COMPETITION_NAME": "SETI",
- "MODEL": {"MODEL_FACTORY": "timm", "MODEL_NAME": "efficientnet_b0"},
- "WORKSPACE": "home",
- "DATA": {
- "TARGET_COL_NAME": "target",
- "IMAGE_COL_NAME": "id",
- "NUM_CLASSES": 1,
- "CLASS_LIST": [0, 1],
- "IMAGE_SIZE": size,
- "CHANNEL_MODE": "spatial_3ch",
- "USE_MIXUP": True
- #"USE_CUTMIX": False
- },
- "CROSS_VALIDATION": {"SCHEMA" : 'StratifiedKFold', "NUM_FOLDS": 5},
- "TRAIN": {
- "DATALOADER": {
- "batch_size": bs,
- "shuffle": True, #using random sampler
- "num_workers": 4,
- "drop_last": False,
- },
- "SETTINGS": {
- "IMAGE_SIZE": size,
- "NUM_EPOCHS": 60,
- "USE_AMP": True,
- "USE_GRAD_ACCUM": False,
- "ACCUMULATION_STEP": 1,
- "DEBUG": False,
- "VERBOSE": True,
- "VERBOSE_STEP": 10,
- },
- },
- "VALIDATION": {
- "DATALOADER": {
- "batch_size": 16,
- "shuffle": False,
- "num_workers": 4,
- "drop_last": False,
- }
- },
- "TEST": {
- "DATALOADER": {
- "batch_size": 16,
- "shuffle": False,
- "num_workers": 4,
- "drop_last": False,
- }
- },
- "OPTIMIZER": {
- "NAME": "AdamW",
- "OPTIMIZER_PARAMS": {"lr": 1e-4, "eps": 1.0e-8, "weight_decay": 1.0e-3},
- },
- "SCHEDULER": {
- "NAME": "CosineAnnealingWarmRestarts",
- "SCHEDULER_PARAMS": {
- "T_0": 4,
- "T_mult": 1,
- "eta_min": 1.0e-7,
- "last_epoch": -1,
- "verbose": True,
- },
- "CUSTOM": "GradualWarmupSchedulerV2",
- "CUSTOM_PARAMS": {"multiplier": 7, "total_epoch": 1},
- "VAL_STEP": False,
- },
- "CRITERION_TRAIN": {
- "NAME": "BCEWithLogitsLoss",
- "LOSS_PARAMS": {
- "weight": None,
- "size_average": None,
- "reduce": None,
- "reduction": "mean",
- "pos_weight": None
- },
- },
- "CRITERION_VALIDATION": {
- "NAME": "BCEWithLogitsLoss",
- "LOSS_PARAMS": {
- "weight": None,
- "size_average": None,
- "reduce": None,
- "reduction": "mean",
- "pos_weight": None
- },
- },
- "TRAIN_TRANSFORMS": {
- "VerticalFlip": {"p": 0.5},
- "HorizontalFlip": {"p": 0.5},
- "Resize": {"height": size, "width": size, "p": 1},
- },
- "VALID_TRANSFORMS": {
- "Resize": {"height": size, "width": size, "p": 1},
- },
- "TEST_TRANSFORMS": {
- "Resize": {"height": size, "width": size, "p": 1},
- },
- "PATH": {
- "DATA_DIR": "/home/apsisdev/data/seti/",
- "TRAIN_CSV": "/home/apsisdev/data/seti/seti-breakthrough-listen/train_labels.csv",
- "TRAIN_PATH": "/home/apsisdev/data/seti/seti-breakthrough-listen/train",
- "TEST_CSV": "/home/apsisdev/data/seti/seti-breakthrough-listen/sample_submission.csv",
- "TEST_PATH": "/home/apsisdev/data/seti/seti-breakthrough-listen/test",
- "SAVE_WEIGHT_PATH": "./",
- "OOF_PATH": "/home/apsisdev/data/seti/seti-breakthrough-listen/",
- "LOG_PATH": "/home/apsisdev/data/seti/seti-breakthrough-listen/log.txt"
- },
- "SEED": 63,
- "DEVICE": "cuda",
- "GPU": "rtx3090",
- }
- config = CONFIG
- def seed_all(seed: int = 63):
- """Seed all random number generators."""
- print("Using Seed Number {}".format(seed))
- os.environ["PYTHONHASHSEED"] = str(
- seed
- ) # set PYTHONHASHSEED env var at fixed value
- torch.manual_seed(seed)
- torch.cuda.manual_seed_all(seed)
- torch.cuda.manual_seed(seed) # pytorch (both CPU and CUDA)
- np.random.seed(seed) # for numpy pseudo-random generator
- random.seed(seed) # set fixed value for python built-in pseudo-random generator
- torch.backends.cudnn.deterministic = True
- torch.backends.cudnn.benchmark = False
- torch.backends.cudnn.enabled = False
- def seed_worker(_worker_id):
- """Seed a worker with the given ID."""
- worker_seed = torch.initial_seed() % 2 ** 32
- np.random.seed(worker_seed)
- random.seed(worker_seed)
- seed_all(config['SEED'])
- train = pd.read_csv(CONFIG['PATH']['TRAIN_CSV'])
- def get_train_file_path(image_id):
- if config['WORKSPACE'] == 'home':
- return "/home/apsisdev/data/seti/seti-breakthrough-listen/train/{}/{}.npy".format(image_id[0], image_id)
- elif config['WORKSPACE'] == 'Colab':
- return "/home/apsisdev/data/seti/seti-breakthrough-listen/{}/{}.npy".format(image_id[0], image_id)
- train['file_path'] = train['id'].apply(get_train_file_path)
- display(train.head())
- def make_folds(train_csv: pd.DataFrame, config) -> pd.DataFrame:
- """Split the given dataframe into training folds."""
- # TODO: add options for cv_scheme as it is cumbersome here.
- if config['CROSS_VALIDATION']['SCHEMA'] == "StratifiedKFold":
- df_folds = train_csv.copy()
- skf = StratifiedKFold(
- n_splits=config['CROSS_VALIDATION']['NUM_FOLDS'], shuffle=True, random_state=config['SEED']
- )
- for fold, (train_idx, val_idx) in enumerate(
- skf.split(
- X=df_folds[config['DATA']['IMAGE_COL_NAME']], y=df_folds[config['DATA']['TARGET_COL_NAME']]
- )
- ):
- df_folds.loc[val_idx, "fold"] = int(fold + 1)
- df_folds["fold"] = df_folds["fold"].astype(int)
- print(df_folds.groupby(["fold", config['DATA']['TARGET_COL_NAME']]).size())
- elif config['CROSS_VALIDATION']['SCHEMA'] == "GroupKFold":
- df_folds = train_csv.copy()
- gkf = GroupKFold(n_splits=config['CROSS_VALIDATION']['NUM_FOLDS'])
- # NOTE: the group column key below is an assumption; CONFIG above does not define one
- groups = df_folds[config['DATA'].get('GROUP_COL_NAME')].values
- for fold, (train_index, val_index) in enumerate(
- gkf.split(X=df_folds, y=df_folds[config['DATA']['TARGET_COL_NAME']], groups=groups)
- ):
- df_folds.loc[val_index, "fold"] = int(fold + 1)
- df_folds["fold"] = df_folds["fold"].astype(int)
- try:
- print(df_folds.groupby(["fold", config['DATA']['TARGET_COL_NAME']]).size())
- except KeyError:
- display(df_folds)
- else: # no CV schema matched; assume a custom "fold" column is already present
- df_folds = train_csv.copy()
- try:
- print(df_folds.groupby(["fold", config['DATA']['TARGET_COL_NAME']]).size())
- except KeyError:
- display(df_folds)
- return df_folds
- df_folds = make_folds(train, config)
- def gem(x, p=3, eps=1e-6):
- return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)
- class GeM(nn.Module):
- def __init__(self, p=3, eps=1e-6):
- super(GeM,self).__init__()
- self.p = Parameter(torch.ones(1)*p)
- self.eps = eps
- def forward(self, x):
- return gem(x, p=self.p, eps=self.eps)
- def __repr__(self):
- return self.__class__.__name__ + '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + ', ' + 'eps=' + str(self.eps) + ')'
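- # GeM pooling is defined above but only referenced in a commented-out line later on
- # (self.model.avg_pool = GeM() in Trainer.train_one_epoch). A quick self-contained check of
- # what it does: pool a random (batch, channels, H, W) feature map down to (batch, channels, 1, 1).
- # The 1280-channel shape below is just a hypothetical efficientnet_b0-sized feature map.
- _gem_demo = GeM(p=3)
- _demo_features = torch.randn(2, 1280, 16, 16)
- print(_gem_demo, _gem_demo(_demo_features).shape) # GeM(p=3.0000, eps=1e-06) torch.Size([2, 1280, 1, 1])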
- class Transform:
- def __init__(self, aug_kwargs: Dict):
- albu_augs = [getattr(A, name)(**kwargs)
- for name, kwargs in aug_kwargs.items()]
- albu_augs.append(ToTensorV2(p=1))
- self.transform = A.Compose(albu_augs)
- def __call__(self, image):
- image = self.transform(image=image)["image"]
- return image
- def mixup_data(x, y, alpha=1.0, use_cuda=True):
- if alpha > 0:
- lam = np.random.beta(alpha, alpha)
- #lam = max(lam, 1-lam)
- else:
- lam = 1
- batch_size = x.size()[0]
- if use_cuda:
- index = torch.randperm(batch_size).cuda()
- else:
- index = torch.randperm(batch_size)
- mixed_x = lam * x + (1 - lam) * x[index, :]
- y_a, y_b = y, y[index]
- return mixed_x, y_a, y_b, lam
- def mixup_criterion(criterion, pred, y_a, y_b, lam):
- return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)
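- # How the two helpers above fit together in a training step, shown on a tiny CPU batch
- # (illustration only; the real call sites are inside Trainer.train_one_epoch further down):
- _x = torch.randn(4, 1, 8, 8)
- _y = torch.randint(0, 2, (4, 1)).float()
- _mx, _ya, _yb, _lam = mixup_data(_x, _y, alpha=1.0, use_cuda=False)
- _logits = torch.randn(4, 1) # stand-in for model(_mx)
- print(_lam, mixup_criterion(nn.BCEWithLogitsLoss(), _logits, _ya, _yb, _lam).item())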
- class AlienTrainDataset(Dataset):
- def __init__(self, df, config, transform=None, mode = 'train'):
- self.df = df
- self.config = config
- self.file_names = df['file_path'].values
- self.labels = df[config['DATA']['TARGET_COL_NAME']].values
- self.transform = transform
- self.mode = mode
- def __len__(self):
- return len(self.df)
- def __getitem__(self, idx):
- image = np.load(self.file_names[idx])
- # print(image.shape) -> (6, 273, 256)
- if self.config['DATA']['CHANNEL_MODE'] == 'spatial_6ch':
- image = image.astype(np.float32)
- image = np.vstack(image) # no transpose here (1638, 256)
- # image = np.vstack(image).transpose((1, 0))
- # print(image.shape) -> (256, 1638)
- elif self.config['DATA']['CHANNEL_MODE'] == 'spatial_3ch':
- image = image[::2].astype(np.float32)
- image = np.vstack(image).transpose((1, 0))
- elif self.config['DATA']['CHANNEL_MODE'] == '6_channel':
- image = image.astype(np.float32)
- image = np.transpose(image, (1,2,0))
- elif self.config['DATA']['CHANNEL_MODE'] == '3_channel':
- image = image[::2].astype(np.float32)
- image = np.transpose(image, (1,2,0))
- if self.transform:
- image = self.transform(image)
- else:
- image = torch.from_numpy(image).float()
- if self.mode == 'test':
- return image
- else:
- label = torch.tensor(self.labels[idx]).float()
- return image, label
- train_dataset = AlienTrainDataset(train, config, transform=Transform(config["TRAIN_TRANSFORMS"]))
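- # Quick sanity check of one sample (assumes the training .npy files exist at the paths built
- # above): with CHANNEL_MODE='spatial_3ch' the three even-indexed cadence snippets are stacked
- # and transposed into a single 2D image, so after Resize and ToTensorV2 the tensor should be
- # (1, 512, 512). The DataLoader below also shows how seed_worker could be wired in for
- # reproducible worker-side augmentation; the loaders built later in this script do not pass
- # worker_init_fn, so this wiring is an illustrative assumption, not part of the pipeline.
- _img, _label = train_dataset[0]
- print(_img.shape, _label)
- _g = torch.Generator()
- _g.manual_seed(config['SEED'])
- _check_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2,
-                            worker_init_fn=seed_worker, generator=_g)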
- class AverageLossMeter:
- """
- Computes and stores the average and current loss
- """
- def __init__(self):
- self.reset()
- def reset(self):
- self.curr_batch_avg_loss = 0
- self.avg = 0
- self.running_total_loss = 0
- self.count = 0
- def update(self, curr_batch_avg_loss: float, batch_size: int):
- self.curr_batch_avg_loss = curr_batch_avg_loss
- self.running_total_loss += curr_batch_avg_loss * batch_size
- self.count += batch_size
- self.avg = self.running_total_loss / self.count
- import warnings
- warnings.filterwarnings("ignore")
- from torch.optim.lr_scheduler import _LRScheduler
- from torch.optim.lr_scheduler import ReduceLROnPlateau
- ### Original Implementation ###
- class GradualWarmupScheduler(_LRScheduler):
- """Gradually warm-up(increasing) learning rate in optimizer.
- Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.
- Args:
- optimizer (Optimizer): Wrapped optimizer.
- multiplier: target learning rate = base lr * multiplier if multiplier > 1.0. if multiplier = 1.0, lr starts from 0 and ends up with the base_lr.
- total_epoch: target learning rate is reached at total_epoch, gradually
- after_scheduler: after target_epoch, use this scheduler(eg. ReduceLROnPlateau)
- """
- def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
- self.multiplier = multiplier
- if self.multiplier < 1.0:
- raise ValueError("multiplier should be greater than or equal to 1.")
- self.total_epoch = total_epoch
- self.after_scheduler = after_scheduler
- self.finished = False
- super(GradualWarmupScheduler, self).__init__(optimizer)
- def get_lr(self):
- if self.last_epoch > self.total_epoch:
- if self.after_scheduler:
- if not self.finished:
- self.after_scheduler.base_lrs = [
- base_lr * self.multiplier for base_lr in self.base_lrs
- ]
- self.finished = True
- return self.after_scheduler.get_last_lr()
- return [base_lr * self.multiplier for base_lr in self.base_lrs]
- if self.multiplier == 1.0:
- return [
- base_lr * (float(self.last_epoch) / self.total_epoch)
- for base_lr in self.base_lrs
- ]
- else:
- return [
- base_lr
- * ((self.multiplier - 1.0) * self.last_epoch / self.total_epoch + 1.0)
- for base_lr in self.base_lrs
- ]
- def step_ReduceLROnPlateau(self, metrics, epoch=None):
- if epoch is None:
- epoch = self.last_epoch + 1
- self.last_epoch = (
- epoch if epoch != 0 else 1
- ) # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning
- if self.last_epoch <= self.total_epoch:
- warmup_lr = [
- base_lr
- * ((self.multiplier - 1.0) * self.last_epoch / self.total_epoch + 1.0)
- for base_lr in self.base_lrs
- ]
- for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):
- param_group["lr"] = lr
- else:
- if epoch is None:
- self.after_scheduler.step(metrics, None)
- else:
- self.after_scheduler.step(metrics, epoch - self.total_epoch)
- def step(self, epoch=None, metrics=None):
- if type(self.after_scheduler) != ReduceLROnPlateau:
- if self.finished and self.after_scheduler:
- if epoch is None:
- self.after_scheduler.step(None)
- else:
- self.after_scheduler.step(epoch - self.total_epoch)
- self._last_lr = self.after_scheduler.get_last_lr()
- else:
- return super(GradualWarmupScheduler, self).step(epoch)
- else:
- self.step_ReduceLROnPlateau(metrics, epoch)
- ### Fix Warmup Bug here, a modified version of above.
- class GradualWarmupSchedulerV2(GradualWarmupScheduler):
- def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
- super(GradualWarmupSchedulerV2, self).__init__(
- optimizer, multiplier, total_epoch, after_scheduler
- )
- def get_lr(self):
- if self.last_epoch > self.total_epoch:
- if self.after_scheduler:
- if not self.finished:
- self.after_scheduler.base_lrs = [
- base_lr * self.multiplier for base_lr in self.base_lrs
- ]
- self.finished = True
- return self.after_scheduler.get_lr()
- return [base_lr * self.multiplier for base_lr in self.base_lrs]
- if self.multiplier == 1.0:
- return [
- base_lr * (float(self.last_epoch) / self.total_epoch)
- for base_lr in self.base_lrs
- ]
- else:
- return [
- base_lr
- * ((self.multiplier - 1.0) * self.last_epoch / self.total_epoch + 1.0)
- for base_lr in self.base_lrs
- ]
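- # How GradualWarmupSchedulerV2 composes with CosineAnnealingWarmRestarts -- the same pairing
- # the Trainer builds from CONFIG further down (multiplier=7, total_epoch=1, T_0=4). A tiny dry
- # run on a throwaway optimizer, just to print the per-epoch learning rate:
- _opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-4)
- _cosine = CosineAnnealingWarmRestarts(_opt, T_0=4, T_mult=1, eta_min=1e-7)
- _warmup_demo = GradualWarmupSchedulerV2(_opt, multiplier=7, total_epoch=1, after_scheduler=_cosine)
- for _e in range(6):
-     print(_e, _opt.param_groups[0]["lr"])
-     _warmup_demo.step()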
- sigmoid = torch.nn.Sigmoid()
- class Swish(torch.autograd.Function):
- @staticmethod
- def forward(ctx, i):
- result = i * sigmoid(i)
- ctx.save_for_backward(i)
- return result
- @staticmethod
- def backward(ctx, grad_output):
- i = ctx.saved_tensors[0]
- sigmoid_i = sigmoid(i)
- return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
- class Swish_Module(torch.nn.Module):
- def forward(self, x):
- return Swish.apply(x)
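- # Swish(x) = x * sigmoid(x), i.e. the same function as torch.nn.SiLU; the custom autograd
- # Function above just writes out the backward pass by hand. Quick equivalence check:
- _t = torch.randn(5)
- print(torch.allclose(Swish_Module()(_t), torch.nn.SiLU()(_t))) # True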
- class AlienSingleHead(torch.nn.Module):
- """A custom model."""
- def __init__(
- self,
- config: type,
- pretrained: bool = True,
- ):
- """Construct a custom model."""
- super().__init__()
- self.config = config
- self.pretrained = pretrained
- print("Pretrained is {}".format(self.pretrained))
- # self.activation = Swish_Module()
- self.activation = Swish_Module()
- self.architecture = {
- "backbone": None,
- "bottleneck": None,
- "classifier_head": None,
- }
- _model_factory = (
- timm.create_model
- if self.config["MODEL"]["MODEL_FACTORY"] == "timm"
- else geffnet.create_model # NOTE: geffnet is never imported in this paste; only the timm path is exercised by CONFIG
- )
- if self.config['DATA']['CHANNEL_MODE'] in ('spatial_6ch', 'spatial_3ch'):
- self.model = _model_factory(
- model_name=self.config["MODEL"]["MODEL_NAME"],
- pretrained=self.pretrained, in_chans=1) # spatial modes stack the cadence into one 2D image, so a single input channel
- else:
- # '6_channel' keeps all six cadence snippets as image channels, '3_channel' keeps three
- in_chans = 6 if self.config['DATA']['CHANNEL_MODE'] == '6_channel' else 3
- self.model = _model_factory(
- model_name=self.config["MODEL"]["MODEL_NAME"],
- pretrained=self.pretrained, in_chans=in_chans)
- # reset head
- self.model.reset_classifier(num_classes=0, global_pool="avg")
- # after resetting, there is no longer any classifier head, therefore it is the backbone now.
- self.architecture["backbone"] = self.model
- # get out features of the last cnn layer from backbone, which is also the in features of the next layer
- self.in_features = self.architecture["backbone"].num_features
- self.single_head_fc = torch.nn.Sequential(
- torch.nn.Linear(self.in_features, self.in_features),
- self.activation,
- #torch.nn.Dropout(p=0.05),
- torch.nn.Linear(self.in_features, self.config["DATA"]["NUM_CLASSES"]),
- )
- self.architecture["classifier_head"] = self.single_head_fc
- # feature map after cnn layer
- def extract_features(self, x):
- feature_logits = self.architecture["backbone"](x)
- # TODO: caution, if you use forward_features, then you need reshape. See test.py
- return feature_logits
- def forward(self, x):
- feature_logits = self.extract_features(x)
- classifier_logits = self.architecture["classifier_head"](feature_logits)
- return classifier_logits
- model = AlienSingleHead(config,pretrained=False)
- train_dataset = AlienTrainDataset(train, config, transform=Transform(config["TRAIN_TRANSFORMS"]))
- train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True,
- num_workers=4, pin_memory=True, drop_last=True)
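- # Smoke test of the untrained model on a dummy batch (no data files needed): with
- # CHANNEL_MODE='spatial_3ch' the backbone is built with in_chans=1, so a (B, 1, 512, 512)
- # input should come out as (B, 1) logits for the single-logit BCE head.
- with torch.no_grad():
-     print(model(torch.randn(2, 1, size, size)).shape) # expected: torch.Size([2, 1])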
- def torchsummary_wrapper(model, image_size: Tuple):
- model_summary = summary(model, image_size)
- return model_summary
- """Model training."""
- import datetime
- import os
- import random
- import time
- import numpy as np
- import pandas as pd
- import pytz
- import sklearn
- import torch
- import torch.nn as nn
- from sklearn.model_selection import GroupKFold
- from torch.utils.data import DataLoader
- from tqdm import tqdm
- from sklearn.metrics import roc_auc_score
- class Trainer:
- """A class to perform model training."""
- def __init__(self, model, config, early_stopping=None, neptune=None):
- """Construct a Trainer instance."""
- self.model = model
- self.patience = 15
- self.config = config
- self.neptune = neptune
- self.early_stopping = early_stopping
- self.epoch = 0
- self.best_auc = 0
- self.log_path = self.config["PATH"]["LOG_PATH"]
- self.best_loss = np.inf
- self.num_epochs = self.config["TRAIN"]["SETTINGS"]["NUM_EPOCHS"]
- self.save_path = self.config["PATH"]["SAVE_WEIGHT_PATH"]
- if not os.path.exists(self.save_path):
- os.makedirs(self.save_path)
- self.device = self.config["DEVICE"]
- """scaler is only used when use_amp is True, use_amp is inside config."""
- if self.config["TRAIN"]["SETTINGS"]["USE_AMP"]:
- self.scaler = torch.cuda.amp.GradScaler()
- self.date = datetime.datetime.now(pytz.timezone("Asia/Dhaka")).strftime(
- "%Y-%m-%d"
- )
- self.log(f"Fitter prepared. Device is {self.device}")
- self.criterion_train = getattr(
- torch.nn, self.config["CRITERION_TRAIN"]["NAME"]
- )(**self.config["CRITERION_TRAIN"]["LOSS_PARAMS"]).to(self.device)
- self.criterion_val = getattr(
- torch.nn, self.config["CRITERION_VALIDATION"]["NAME"]
- )(**self.config["CRITERION_VALIDATION"]["LOSS_PARAMS"])
- self.optimizer = getattr(torch.optim, self.config["OPTIMIZER"]["NAME"])(
- self.model.parameters(), **self.config["OPTIMIZER"]["OPTIMIZER_PARAMS"]
- )
- self.scheduler = getattr(
- torch.optim.lr_scheduler, self.config["SCHEDULER"]["NAME"]
- )(optimizer=self.optimizer, **self.config["SCHEDULER"]["SCHEDULER_PARAMS"])
- self.scheduler_warmup = GradualWarmupSchedulerV2(
- self.optimizer,
- **self.config["SCHEDULER"]["CUSTOM_PARAMS"],
- after_scheduler=self.scheduler,
- ) # total epoch = warmup epoch
- self.val_predictions = None
- def fit(self, train_loader, val_loader, fold: int):
- """Fit the model on the given fold."""
- self.log(
- "Training on Fold {} and using {}".format(
- fold, self.config["MODEL"]["MODEL_NAME"]
- )
- )
- for _epoch in range(self.num_epochs):
- # Getting the learning rate after each epoch!
- current_lr = self.optimizer.param_groups[0]["lr"]
- timestamp = datetime.datetime.now(pytz.timezone("Asia/Dhaka")).strftime(
- "%Y-%m-%d %H-%M-%S"
- )
- # printing the lr and the timestamp after each epoch.
- self.log("\n{}\nLR: {}".format(timestamp, current_lr))
- # start time of training on the training set
- train_start_time = time.time()
- '''
- if(_epoch<6):
- print('light aug....')
- self.config['DATA']['USE_MIXUP']=False
- self.config['DATA']['USE_CUTMIX']=False
- elif(_epoch>5 and _epoch<20):
- print('mixup without hesitation....')
- self.config['DATA']['USE_MIXUP']=True
- self.config['DATA']['USE_CUTMIX']=False
- else:
- print('cutmix....')
- self.config['DATA']['USE_MIXUP']= False
- self.config['DATA']['USE_CUTMIX']=True
- '''
- # train one epoch on the training set
- avg_train_loss = self.train_one_epoch(train_loader)
- # end time of training on the training set
- train_end_time = time.time()
- # formatting time to make it nicer
- train_elapsed_time = time.strftime(
- "%H:%M:%S", time.gmtime(train_end_time - train_start_time)
- )
- self.log(
- "[RESULT]: Train. Epoch {} | Avg Train Summary Loss: {:.3f} | "
- "Time Elapsed: {}".format(
- self.epoch + 1,
- avg_train_loss,
- train_elapsed_time,
- )
- )
- val_start_time = time.time()
- (
- avg_val_loss,
- avg_val_roc,
- val_predictions,
- ) = self.valid_one_epoch(val_loader)
- # here we get oof preds
- self.val_predictions = val_predictions
- val_end_time = time.time()
- val_elapsed_time = time.strftime(
- "%H:%M:%S", time.gmtime(val_end_time - val_start_time)
- )
- # self.neptune["Metrics/AUC"].log(avg_val_roc)
- self.log(
- "[RESULT]: Validation. Epoch: {} | "
- "Avg Validation Summary Loss: {:.3f} | "
- "Validation ROC: {:.3f} | Time Elapsed: {}".format(
- self.epoch + 1,
- avg_val_loss,
- avg_val_roc,
- val_elapsed_time,
- )
- )
- # added this flag right before early stopping to let user
- # know which metric im monitoring.
- self.monitored_metrics = avg_val_roc
- if self.early_stopping is not None:
- best_score, early_stop = self.early_stopping.should_stop(
- curr_epoch_score=self.monitored_metrics
- )
- self.best_loss = best_score
- self.save(
- "{}_best_loss_fold_{}.pt".format(
- self.config["MODEL"]["MODEL_NAME"], fold
- )
- )
- if early_stop:
- break
- else:
- if avg_val_loss < self.best_loss:
- self.best_loss = avg_val_loss
- if self.best_auc < avg_val_roc:
- self.best_auc = avg_val_roc
- self.save(
- os.path.join(
- self.save_path,
- "{}_{}_best_auc_fold_{}.pt".format(
- self.date, self.config["MODEL"]["MODEL_NAME"], fold
- ),
- )
- )
- self.patience = 15
- else:
- self.patience -= 1
- if self.patience == 0:
- print("Early Stopping")
- break
- '''
- CosineAnnealingWarmRestart
- '''
- self.scheduler_warmup.step()
- if _epoch==2: self.scheduler_warmup.step() # bug workaround
- if self.config["SCHEDULER"]["VAL_STEP"]:
- if isinstance(
- self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau
- ):
- self.scheduler.step(self.monitored_metrics)
- else:
- self.scheduler.step()
- # end of training, epoch + 1 so that self.epoch can be updated.
- self.epoch += 1
- curr_fold_best_checkpoint = self.load(
- os.path.join(
- self.save_path,
- "{}_{}_best_auc_fold_{}.pt".format(
- self.date, self.config["MODEL"]["MODEL_NAME"], fold
- ),
- )
- )
- return curr_fold_best_checkpoint
- def train_one_epoch(self, train_loader):
- """Train one epoch of the model."""
- # set to train mode
- #self.model.avg_pool = GeM()
- self.model.train()
- #self.config = myconfig
- # log metrics
- train_summary_loss = AverageLossMeter()
- # TODO: use Alex's ROC METER?
- # timer
- start_time = time.time()
- train_bar = train_loader
- # looping through train loader for one epoch, steps is the
- # number of times to go through each epoch
- for step, (images, labels) in enumerate(train_bar):
- if self.config['DATA']['USE_MIXUP']:
- # move to device first: mixup_data(use_cuda=True) builds a CUDA permutation index,
- # which cannot be used to index CPU tensors
- images, labels = (
- images.to(self.device).float(),
- labels.to(self.device),
- )
- images, targets_a, targets_b, lam = mixup_data(images, labels.view(-1, 1), use_cuda=True)
- else:
- images, labels = (
- images.to(self.device).float(),
- labels.to(self.device),
- )
- batch_size = labels.shape[0]
- if (
- self.config["TRAIN"]["SETTINGS"]["USE_AMP"] is True
- and self.config["TRAIN"]["SETTINGS"]["USE_GRAD_ACCUM"] is False
- ):
- """I would think clearing gradients here is the correct way, as opposed to calling it last."""
- self.optimizer.zero_grad()
- with torch.cuda.amp.autocast():
- logits = self.model(images)
- if self.config["DATA"]["USE_MIXUP"]:
- train_loss = mixup_criterion(self.criterion_train, logits, targets_a, targets_b, lam)
- #train_loss = mixup_criterion(self.criterion_train,targets_a, targets_b, lam)
- else:
- train_loss = self.criterion_train(input=logits.view(-1), target=labels) # use view here for BCELogitLoss
- loss_value = train_loss.item()
- self.scaler.scale(train_loss).backward()
- self.scaler.step(self.optimizer)
- self.scaler.update()
- elif (
- self.config["TRAIN"]["SETTINGS"]["USE_AMP"] is True
- and self.config["TRAIN"]["SETTINGS"]["USE_GRAD_ACCUM"] is True
- ):
- with torch.cuda.amp.autocast():
- logits = self.model(images)
- train_loss = self.criterion_train(input=logits.view(-1), target=labels) # use view here for BCEWithLogitsLoss
- train_loss = (
- train_loss
- / self.config["TRAIN"]["SETTINGS"]["ACCUMULATION_STEP"]
- )
- loss_value = train_loss.item()
- self.scaler.scale(train_loss).backward()
- if (step + 1) % self.config["TRAIN"]["SETTINGS"][
- "ACCUMULATION_STEP"
- ] == 0:
- self.scaler.step(self.optimizer)
- self.scaler.update()
- self.optimizer.zero_grad()
- else:
- logits = self.model(images)
- train_loss = self.criterion_train(input=logits.view(-1), target=labels) # use view here for BCEWithLogitsLoss
- loss_value = train_loss.item()
- self.optimizer.zero_grad()
- train_loss.backward()
- self.optimizer.step()
- train_summary_loss.update(train_loss.item(), batch_size)
- # here onwards, we have already completed the necessary forward pass and backprop, so we can come out of the if else loop.
- # these values are not used further below; kept for optional logging. With a single-logit
- # BCE head, probabilities come from sigmoid (softmax over a (B, 1) tensor is always 1.0).
- y_true = labels.detach().cpu().numpy()
- y_probs = torch.sigmoid(logits.detach()).cpu().numpy()
- # measure elapsed time
- end_time = time.time()
- #train_bar.set_description(f"loss: {train_summary_loss.avg:.3f}")
- if self.config["TRAIN"]["SETTINGS"]["VERBOSE"]:
- if (step % self.config["TRAIN"]["SETTINGS"]["VERBOSE_STEP"]) == 0:
- print(
- f"Train Steps {step}/{len(train_loader)}, "
- f"summary_loss: {train_summary_loss.avg:.3f}, "
- f"time: {(end_time - start_time):.3f}",
- end="\r",
- )
- return train_summary_loss.avg
- # @torch.no_grad
- def valid_one_epoch(self, val_loader):
- """Validate one training epoch."""
- # set to eval mode
- self.model.eval()
- # log metrics
- valid_summary_loss = AverageLossMeter()
- # timer
- start_time = time.time()
- LOGITS = []
- Y_TRUE = []
- Y_PROBS = []
- POSITIVE_CLASS_PROBS = []
- with torch.no_grad():
- for step, (images, labels) in enumerate(val_loader):
- images, labels = (
- images.to(self.device).float(),
- labels.to(self.device),
- )
- batch_size = labels.shape[0]
- logits = self.model(images)
- val_loss = self.criterion_val(input=logits.view(-1), target=labels) # use view here for BCELogitLoss
- loss_value = val_loss.item()
- valid_summary_loss.update(loss_value, batch_size)
- sigmoid_preds = torch.sigmoid(logits)
- y_preds = (sigmoid_preds.detach().cpu().numpy() > 0.5).astype(int) # hard predictions (unused below)
- LOGITS.append(logits.detach().cpu())
- Y_TRUE.append(labels.detach().cpu())
- Y_PROBS.append(sigmoid_preds.detach().cpu())
- end_time = time.time()
- if self.config["TRAIN"]["SETTINGS"]["VERBOSE"]:
- if (step % self.config["TRAIN"]["SETTINGS"]["VERBOSE_STEP"]) == 0:
- print(
- f"Validation Steps {step}/{len(val_loader)}, "
- + f"summary_loss: {valid_summary_loss.avg:.3f},"
- + f"time: {(end_time - start_time):.3f}",
- end="\r",
- )
- LOGITS = torch.cat(LOGITS).numpy()
- Y_TRUE = torch.cat(Y_TRUE).numpy()
- Y_PROBS = torch.cat(Y_PROBS).numpy()
- if self.config["DATA"]["NUM_CLASSES"] > 2:
- val_roc_auc_score = sklearn.metrics.roc_auc_score(
- y_true=Y_TRUE, y_score=Y_PROBS, multi_class="ovr"
- )
- else:
- val_roc_auc_score = sklearn.metrics.roc_auc_score(
- y_true=Y_TRUE, y_score=Y_PROBS
- )
- return (valid_summary_loss.avg, val_roc_auc_score, Y_PROBS)
- def save_model(self, path):
- """Save the trained model."""
- self.model.eval()
- torch.save(self.model.state_dict(), path)
- # will save the weight for the best val loss and corresponding oof preds
- def save(self, path):
- """Save the weight for the best evaluation loss."""
- self.model.eval()
- torch.save(
- {
- "model_state_dict": self.model.state_dict(),
- "optimizer_state_dict": self.optimizer.state_dict(),
- "scheduler_state_dict": self.scheduler.state_dict(),
- "best_auc": self.best_auc,
- "best_loss": self.best_loss,
- "epoch": self.epoch,
- "oof_preds": self.val_predictions,
- },
- path,
- )
- def load(self, path):
- """Load a model checkpoint from the given path."""
- checkpoint = torch.load(path)
- return checkpoint
- def log(self, message):
- """Log a message."""
- if self.config["TRAIN"]["SETTINGS"]["VERBOSE"]:
- print(message)
- with open(self.config["PATH"]["LOG_PATH"], "a+") as logger:
- logger.write(f"{message}\n")
- def train_on_fold(model, df_folds: pd.DataFrame, config, fold: int, neptune=None):
- """Train the model on the given fold."""
- model.to(config["DEVICE"])
- try:
- # torchsummary's summary() prints the model table as a side effect (it returns None);
- # a RuntimeError here usually means the input channel count does not match CHANNEL_MODE
- torchsummary_wrapper(
- model, (1, config["DATA"]["IMAGE_SIZE"], config["DATA"]["IMAGE_SIZE"])
- )
- except RuntimeError:
- print("Check the channel number.")
- if config["TRAIN"]["SETTINGS"]["DEBUG"]:
- # args.n_epochs = 5
- df_train = df_folds[df_folds["fold"] != fold].sample(
- config["TRAIN"]["DATALOADER"]["batch_size"] * 128
- )
- df_valid = df_folds[df_folds["fold"] == fold].sample(
- config["TRAIN"]["DATALOADER"]["batch_size"] * 128
- )
- else:
- df_train = df_folds[df_folds["fold"] != fold].reset_index(drop=True)
- df_valid = df_folds[df_folds["fold"] == fold].reset_index(drop=True)
- dataset_train = AlienTrainDataset(
- config=config,
- df=df_train,
- mode="train",
- transform=Transform(config["TRAIN_TRANSFORMS"]),
- )
- dataset_valid = AlienTrainDataset(
- config=config,
- df=df_valid,
- mode="valid",
- transform=Transform(config["VALID_TRANSFORMS"]),
- )
- train_loader = torch.utils.data.DataLoader(
- dataset_train,
- # sampler=RandomSampler(dataset_train),
- **config["TRAIN"]["DATALOADER"],
- )
- valid_loader = torch.utils.data.DataLoader(
- dataset_valid, **config["VALIDATION"]["DATALOADER"]
- )
- hongnan_classifier = Trainer(model=model, config=config, neptune=neptune)
- curr_fold_best_checkpoint = hongnan_classifier.fit(train_loader, valid_loader, fold)
- # print(len(curr_fold_best_checkpoint["oof_preds"]))
- df_valid[
- [str(c) for c in range(config["DATA"]["NUM_CLASSES"])]
- ] = curr_fold_best_checkpoint["oof_preds"]
- # val_df["preds"] = curr_fold_best_checkpoint["oof_preds"].argmax(1)
- return df_valid
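- # get_oof_roc is called inside train_loop below but is not defined anywhere in this paste.
- # A minimal sketch under these assumptions: the OOF probabilities live in the string-numbered
- # columns written at the end of train_on_fold, the label column is DATA.TARGET_COL_NAME, and
- # callers expect a (report_dict, auc) tuple (train_loop unpacks two values per fold).
- def get_oof_roc(config, oof_df: pd.DataFrame):
-     """Compute ROC AUC over an out-of-fold predictions dataframe."""
-     y_true = oof_df[config["DATA"]["TARGET_COL_NAME"]].values
-     prob_cols = [str(c) for c in range(config["DATA"]["NUM_CLASSES"])]
-     y_probs = oof_df[prob_cols].values
-     if config["DATA"]["NUM_CLASSES"] > 2:
-         auc = roc_auc_score(y_true=y_true, y_score=y_probs, multi_class="ovr")
-     else:
-         auc = roc_auc_score(y_true=y_true, y_score=y_probs.reshape(-1))
-     return {"roc_auc": auc, "num_samples": len(oof_df)}, auc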
- def train_loop(
- model,
- df_folds: pd.DataFrame,
- config,
- fold_num: int = None,
- train_one_fold=False,
- neptune=None,
- ):
- """Perform the training loop on all folds. Here The CV score is the average of the validation fold metric.
- While the OOF score is the aggregation of all validation folds."""
- cv_score_list = []
- oof_df = pd.DataFrame()
- if train_one_fold:
- _oof_df = train_on_fold(
- model, df_folds=df_folds, config=config, fold=fold_num, neptune=neptune
- )
- _oof_df.to_csv(os.path.join(config["PATH"]["OOF_PATH"], "_oof.csv"))
- # curr_fold_best_score = get_oof_roc(config, _oof_df)
- # print("Fold {} OOF Score is {}".format(fold_num, curr_fold_best_score))
- else:
- """The below for loop code guarantees fold starts from 1 and not 0. https://stackoverflow.com/questions/33282444/pythonic-way-to-iterate-through-a-range-starting-at-1"""
- for fold in (
- number + 1 for number in range(config["CROSS_VALIDATION"]["NUM_FOLDS"])
- ):
- _oof_df = train_on_fold(
- model, df_folds=df_folds, config=config, fold=fold, neptune=neptune
- )
- oof_df = pd.concat([oof_df, _oof_df])
- curr_fold_best_score_dict, curr_fold_best_score = get_oof_roc(
- config, _oof_df
- )
- cv_score_list.append(curr_fold_best_score)
- print(
- "\n\n\nOOF Score for Fold {}: {}\n\n\n".format(
- fold, curr_fold_best_score
- )
- )
- print("CV score", np.mean(cv_score_list))
- print("Variance", np.var(cv_score_list))
- print("Five Folds OOF", get_oof_roc(config, oof_df))
- oof_df.to_csv(os.path.join(config["PATH"]["OOF_PATH"], "oof.csv"))
- model_pretrained = AlienSingleHead(config=config, pretrained=True)
- train_loop(
- model_pretrained, df_folds, config, fold_num=0, train_one_fold=True, neptune=None
- )