Maroxtn

Untitled

Jun 17th, 2021
626
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3.  
  4. # In[ ]:
  5.  
  6.  
  7.  
  8. import io
  9. import os
  10. import torch
  11. from tqdm.notebook import tqdm
  12. from torch.utils.data import Dataset, DataLoader
  13. from sklearn.metrics import classification_report, accuracy_score
  14. from transformers import (set_seed,
  15.                           TrainingArguments,
  16.                           Trainer,
  17.                           AutoConfig,
  18.                           AutoTokenizer,
  19.                           AdamW,
  20.                           get_linear_schedule_with_warmup,
  21.                           AutoModelForSequenceClassification)
  22.  
  23. from sklearn.metrics import precision_recall_fscore_support as score
  24.  
  25.  
  26. # In[ ]:
  27.  
  28.  
  29. use_wandb = False
  30. epochs = 8
  31.  
  32.  
  33. # In[3]:
  34.  
  35.  
  36. import random
  37. import numpy as np
  38.  
  39. def set_seed(seed=12):
  40.     """Set seed for reproducibility.
  41.    """
  42.     random.seed(seed)
  43.     np.random.seed(seed)
  44.     torch.manual_seed(seed)
  45.     torch.cuda.manual_seed_all(seed)
  46.    
  47.     os.environ['PYTHONHASHSEED'] = str(seed)
  48.    
  49. set_seed()
  50.  
  51.  
  52. # In[4]:
  53.  
  54.  
  55. if use_wandb:
  56.    
  57.     import wandb
  58.  
  59.     wandb.login(key="69968957548c81fa530d32661ab316213ff08545")
  60.     wandb.init(project="multi-class-text-classifier")
  61.  
  62.  
  63.     wandb.run.name = "two-gpts"
  64.     wandb.run.save()
  65.  
  66.  
  67.     config = wandb.config
  68.     config.description = "Establishing baseline from GPT2 model"
  69.     config.model_used = "GPT2-Large"
  70.  
  71.  
  72. # In[5]:
  73.  
  74.  
  75. def get_model_tokenizer():
  76.    
  77.     tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
  78.  
  79.  
  80.     model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name, num_labels=len(labels_ids))
  81.     model = model.to(device)
  82.    
  83.     return model, tokenizer
  84.  
  85.  
  86. # In[6]:
  87.  
  88.  
  89. class Gpt2ClassificationCollator(object):
  90.  
  91.     def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):
  92.  
  93.         self.use_tokenizer = use_tokenizer
  94.         self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
  95.         self.labels_encoder = labels_encoder
  96.         return
  97.  
  98.     def __call__(self, sequences):
  99.  
  100.         texts = [sequence['text'] for sequence in sequences]
  101.         labels = [sequence['label'] for sequence in sequences]
  102.         inputs = self.use_tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True,  max_length=self.max_sequence_len)
  103.         inputs.update({'labels':torch.tensor(labels)})
  104.  
  105.         return inputs
  106.  
  107.  
  108. # In[7]:
  109.  
  110.  
  111. class ClassificationDataset(Dataset):
  112.  
  113.  
  114.     def __init__(self, df):
  115.  
  116.         self.texts = df.text.values.tolist()
  117.         self.labels = df.label.values.tolist()
  118.  
  119.     def __len__(self):
  120.  
  121.         return len(self.texts)
  122.  
  123.     def __getitem__(self, item):
  124.  
  125.         return {'text':self.texts[item],
  126.             'label':self.labels[item]}
  127.  
  128.  
  129. # In[8]:
  130.  
  131.  
  132.  
  133.  
  134. def train_epoch(dataloader, optimizer_, scheduler_, device_, model):
  135.  
  136.     predictions_labels = []
  137.     true_labels = []
  138.     total_loss = 0
  139.  
  140.     model.train()
  141.  
  142.     for batch in tqdm(dataloader, total=len(dataloader)):
  143.  
  144.         true_labels += batch['labels'].numpy().flatten().tolist()
  145.         batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}
  146.  
  147.         model.zero_grad()
  148.  
  149.         outputs = model(**batch)
  150.  
  151.         loss, logits = outputs[:2]
  152.  
  153.         total_loss += loss.item()
  154.  
  155.         loss.backward()
  156.  
  157.         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
  158.  
  159.         optimizer.step()
  160.  
  161.         scheduler.step()
  162.  
  163.         logits = logits.detach().cpu().numpy()
  164.  
  165.         predictions_labels += logits.argmax(axis=-1).flatten().tolist()
  166.  
  167.         avg_epoch_loss = total_loss / len(dataloader)
  168.  
  169.     return true_labels, predictions_labels, avg_epoch_loss
  170.  
  171.  
  172.  
  173. def validation(dataloader, device_, model):
  174.  
  175.     predictions_labels = []
  176.     true_labels = []
  177.     total_loss = 0
  178.  
  179.     model.eval()
  180.  
  181.     for batch in tqdm(dataloader, total=len(dataloader)):
  182.  
  183.         true_labels += batch['labels'].numpy().flatten().tolist()
  184.         batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}
  185.  
  186.         with torch.no_grad():        
  187.  
  188.             outputs = model(**batch)
  189.  
  190.             loss, logits = outputs[:2]
  191.             logits = logits.detach().cpu().numpy()
  192.             total_loss += loss.item()
  193.  
  194.             predict_content = logits.argmax(axis=-1).flatten().tolist()
  195.  
  196.             predictions_labels += predict_content
  197.  
  198.     avg_epoch_loss = total_loss / len(dataloader)
  199.    
  200.     return true_labels, predictions_labels, avg_epoch_loss
  201.  
  202.  
  203. # In[9]:
  204.  
  205. import argparse
  206. def train_model(model, epochs, train_dataloader, valid_dataloader, device, scheduler,  prefix=""):
  207.  
  208.     best_val_acc = 0
  209.     best_preds = None
  210.     best_labels = None
  211.  
  212.     if use_wandb:  wandb.watch(model)
  213.  
  214.  
  215.     print('Epoch')
  216.     for epoch in tqdm(range(epochs)):
  217.  
  218.         print()
  219.         print('Training on batches...')
  220.  
  221.         train_labels, train_predict, train_loss = train_epoch(train_dataloader, optimizer, scheduler, device, model)
  222.         train_acc = accuracy_score(train_labels, train_predict)
  223.  
  224.         print('Validation on batches...')
  225.         valid_labels, valid_predict, val_loss = validation(valid_dataloader, device, model)
  226.         val_acc = accuracy_score(valid_labels, valid_predict)
  227.  
  228.         if use_wandb:
  229.             #Log
  230.             wandb.log({"train_loss": train_loss})
  231.             wandb.log({"train_acc": train_acc})
  232.  
  233.             wandb.log({"test_loss": val_loss})
  234.             wandb.log({"test_acc": val_acc})
  235.  
  236.             precision, recall, fscore, support = score(valid_labels, valid_predict)
  237.  
  238.             for k, n in enumerate(labels_ids):
  239.                 wandb.log({f"precision-{n}": precision[k]})
  240.                 wandb.log({f"recall-{n}": recall[k]})
  241.                 wandb.log({f"fscore-{n}": fscore[k]})
  242.  
  243.  
  244.         if val_acc > best_val_acc:
  245.             torch.save(model, prefix+"best_model.pt")
  246.             best_val_acc = val_acc
  247.  
  248.             best_preds = valid_predict
  249.             best_labels = valid_labels
  250.  
  251.  
  252.         # Print loss and accuracy values to see how training evolves.
  253.         print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(train_loss, val_loss, train_acc, val_acc))
  254.         print()
  255.  
  256.     return best_preds, best_labels, best_val_acc
  257.  
  258.  
  259. def main():
  260.     # In[10]:
  261.    
  262.     parser = argparse.ArgumentParser()
  263.     parser.add_argument("--lr", type=float)
  264.     parser.add_argument("--batch_size", type=int)
  265.  
  266.     args = parser.parse_args()
  267.  
  268.    
  269.     lr, batch_size = args.lr, args.batch_size
  270.     print("============")
  271.     print(str(lr) + " " + str(batch_size))
  272.  
  273.     train = pd.read_csv("../input/upwork02/train.csv").sample(frac=1)
  274.     test = pd.read_csv("../input/upwork02/test.csv").sample(frac=1)
  275.  
  276.  
  277.     train = train[(train.label != 'Expertise - "Teach Me"') & (train.label != "Other") & (train.label != "Business Opportunity ")]
  278.  
  279.     le = preprocessing.LabelEncoder()
  280.     le.fit(train.label)
  281.  
  282.  
  283.     train.label = le.transform(train.label)
  284.     test.label = le.transform(test.label)
  285.  
  286.     train["label_names"] = le.inverse_transform(train.label)
  287.     test["label_names"] = le.inverse_transform(test.label)
  288.  
  289.     labels_ids = dict(zip(le.classes_, le.transform(le.classes_)))
  290.  
  291.     #######################
  292.  
  293.     train_df = train
  294.     test_df = test
  295.  
  296.     model_name = "roberta-large"
  297.     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  298.     model, tokenizer = get_model_tokenizer()
  299.  
  300.  
  301.     gpt2_classificaiton_collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer,
  302.                                                             labels_encoder=labels_ids,
  303.                                                             max_sequence_len=96)
  304.  
  305.  
  306.     train_dataset = ClassificationDataset(train_df)
  307.     train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classificaiton_collator)
  308.  
  309.     valid_dataset = ClassificationDataset(test_df)
  310.     valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classificaiton_collator)
  311.  
  312.  
  313.     optimizer = AdamW(model.parameters(),
  314.                     lr = lr, # default is 5e-5, our notebook had 2e-5
  315.                     eps = 1e-8 # default is 1e-8.
  316.                     )
  317.  
  318.  
  319.     total_steps = len(train_dataloader) * epochs
  320.  
  321.     scheduler = get_linear_schedule_with_warmup(optimizer,
  322.                                                 num_warmup_steps = 0, # Default value in run_glue.py
  323.                                                 num_training_steps = total_steps)
  324.  
  325.     best_preds, best_labels, best_val_acc = train_model(model, epochs, train_dataloader, valid_dataloader, device, scheduler,  prefix="binary_")
  326.  
  327.  
  328.     print(best_val_acc)
  329.  
  330.  
  331. main()
RAW Paste Data