Maroxtn

Untitled

Jun 17th, 2021
782
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3.  
  4. # In[ ]:
  5.  
  6.  
  7. import pandas as pd
  8. from sklearn import preprocessing
  9.  
# Name of the pretrained checkpoint. get_model_tokenizer() reads this
# module-level value when loading both the tokenizer and the classifier.
model_name = "roberta-large"
  11.  
  12. import io
  13. import os
  14. import torch
  15. from tqdm.notebook import tqdm
  16. from torch.utils.data import Dataset, DataLoader
  17. from sklearn.metrics import classification_report, accuracy_score
  18. from transformers import (set_seed,
  19.                           TrainingArguments,
  20.                           Trainer,
  21.                           AutoConfig,
  22.                           AutoTokenizer,
  23.                           AdamW,
  24.                           get_linear_schedule_with_warmup,
  25.                           AutoModelForSequenceClassification)
  26.  
  27. from sklearn.metrics import precision_recall_fscore_support as score
  28.  
  29.  
  30. # In[ ]:
  31.  
  32.  
# When True, the script logs in to Weights & Biases further down and
# train_model() reports metrics to it; when False no wandb code runs.
use_wandb = False
# Number of training epochs. main() also multiplies it by the number of
# training batches to size the learning-rate schedule.
epochs = 8
  35.  
  36.  
  37. # In[3]:
  38.  
  39.  
  40. import random
  41. import numpy as np
  42.  
  43. def set_seed(seed=12):
  44.     """Set seed for reproducibility.
  45.    """
  46.     random.seed(seed)
  47.     np.random.seed(seed)
  48.     torch.manual_seed(seed)
  49.     torch.cuda.manual_seed_all(seed)
  50.    
  51.     os.environ['PYTHONHASHSEED'] = str(seed)
  52.    
  53. set_seed()
  54.  
  55.  
  56. # In[4]:
  57.  
  58.  
  59. if use_wandb:
  60.    
  61.     import wandb
  62.  
  63.     wandb.login(key="69968957548c81fa530d32661ab316213ff08545")
  64.     wandb.init(project="multi-class-text-classifier")
  65.  
  66.  
  67.     wandb.run.name = "two-gpts"
  68.     wandb.run.save()
  69.  
  70.  
  71.     config = wandb.config
  72.     config.description = "Establishing baseline from GPT2 model"
  73.     config.model_used = "GPT2-Large"
  74.  
  75.  
  76. # In[5]:
  77.  
  78.  
  79. def get_model_tokenizer():
  80.    
  81.     tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
  82.  
  83.  
  84.     model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name, num_labels=len(labels_ids))
  85.     model = model.to(device)
  86.    
  87.     return model, tokenizer
  88.  
  89.  
  90. # In[6]:
  91.  
  92.  
  93. class Gpt2ClassificationCollator(object):
  94.  
  95.     def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):
  96.  
  97.         self.use_tokenizer = use_tokenizer
  98.         self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len
  99.         self.labels_encoder = labels_encoder
  100.         return
  101.  
  102.     def __call__(self, sequences):
  103.  
  104.         texts = [sequence['text'] for sequence in sequences]
  105.         labels = [sequence['label'] for sequence in sequences]
  106.         inputs = self.use_tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True,  max_length=self.max_sequence_len)
  107.         inputs.update({'labels':torch.tensor(labels)})
  108.  
  109.         return inputs
  110.  
  111.  
  112. # In[7]:
  113.  
  114.  
  115. class ClassificationDataset(Dataset):
  116.  
  117.  
  118.     def __init__(self, df):
  119.  
  120.         self.texts = df.text.values.tolist()
  121.         self.labels = df.label.values.tolist()
  122.  
  123.     def __len__(self):
  124.  
  125.         return len(self.texts)
  126.  
  127.     def __getitem__(self, item):
  128.  
  129.         return {'text':self.texts[item],
  130.             'label':self.labels[item]}
  131.  
  132.  
  133. # In[8]:
  134.  
  135.  
  136.  
  137.  
  138. def train_epoch(dataloader, optimizer_, scheduler_, device_, model):
  139.  
  140.     predictions_labels = []
  141.     true_labels = []
  142.     total_loss = 0
  143.  
  144.     model.train()
  145.  
  146.     for batch in tqdm(dataloader, total=len(dataloader)):
  147.  
  148.         true_labels += batch['labels'].numpy().flatten().tolist()
  149.         batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}
  150.  
  151.         model.zero_grad()
  152.  
  153.         outputs = model(**batch)
  154.  
  155.         loss, logits = outputs[:2]
  156.  
  157.         total_loss += loss.item()
  158.  
  159.         loss.backward()
  160.  
  161.         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
  162.  
  163.         optimizer.step()
  164.  
  165.         scheduler.step()
  166.  
  167.         logits = logits.detach().cpu().numpy()
  168.  
  169.         predictions_labels += logits.argmax(axis=-1).flatten().tolist()
  170.  
  171.         avg_epoch_loss = total_loss / len(dataloader)
  172.  
  173.     return true_labels, predictions_labels, avg_epoch_loss
  174.  
  175.  
  176.  
  177. def validation(dataloader, device_, model):
  178.  
  179.     predictions_labels = []
  180.     true_labels = []
  181.     total_loss = 0
  182.  
  183.     model.eval()
  184.  
  185.     for batch in tqdm(dataloader, total=len(dataloader)):
  186.  
  187.         true_labels += batch['labels'].numpy().flatten().tolist()
  188.         batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}
  189.  
  190.         with torch.no_grad():        
  191.  
  192.             outputs = model(**batch)
  193.  
  194.             loss, logits = outputs[:2]
  195.             logits = logits.detach().cpu().numpy()
  196.             total_loss += loss.item()
  197.  
  198.             predict_content = logits.argmax(axis=-1).flatten().tolist()
  199.  
  200.             predictions_labels += predict_content
  201.  
  202.     avg_epoch_loss = total_loss / len(dataloader)
  203.    
  204.     return true_labels, predictions_labels, avg_epoch_loss
  205.  
  206.  
  207. # In[9]:
  208.  
  209. import argparse
  210. def train_model(model, epochs, train_dataloader, valid_dataloader, device, scheduler,  prefix=""):
  211.  
  212.     best_val_acc = 0
  213.     best_preds = None
  214.     best_labels = None
  215.  
  216.     if use_wandb:  wandb.watch(model)
  217.  
  218.  
  219.     print('Epoch')
  220.     for epoch in tqdm(range(epochs)):
  221.  
  222.         print()
  223.         print('Training on batches...')
  224.  
  225.         train_labels, train_predict, train_loss = train_epoch(train_dataloader, optimizer, scheduler, device, model)
  226.         train_acc = accuracy_score(train_labels, train_predict)
  227.  
  228.         print('Validation on batches...')
  229.         valid_labels, valid_predict, val_loss = validation(valid_dataloader, device, model)
  230.         val_acc = accuracy_score(valid_labels, valid_predict)
  231.  
  232.         if use_wandb:
  233.             #Log
  234.             wandb.log({"train_loss": train_loss})
  235.             wandb.log({"train_acc": train_acc})
  236.  
  237.             wandb.log({"test_loss": val_loss})
  238.             wandb.log({"test_acc": val_acc})
  239.  
  240.             precision, recall, fscore, support = score(valid_labels, valid_predict)
  241.  
  242.             for k, n in enumerate(labels_ids):
  243.                 wandb.log({f"precision-{n}": precision[k]})
  244.                 wandb.log({f"recall-{n}": recall[k]})
  245.                 wandb.log({f"fscore-{n}": fscore[k]})
  246.  
  247.  
  248.         if val_acc > best_val_acc:
  249.             torch.save(model, prefix+"best_model.pt")
  250.             best_val_acc = val_acc
  251.  
  252.             best_preds = valid_predict
  253.             best_labels = valid_labels
  254.  
  255.  
  256.         # Print loss and accuracy values to see how training evolves.
  257.         print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(train_loss, val_loss, train_acc, val_acc))
  258.         print()
  259.  
  260.     return best_preds, best_labels, best_val_acc
  261.  
  262.  
  263. def main():
  264.     # In[10]:
  265.    
  266.     parser = argparse.ArgumentParser()
  267.     parser.add_argument("--lr", type=float)
  268.     parser.add_argument("--batch_size", type=int)
  269.  
  270.     args = parser.parse_args()
  271.  
  272.    
  273.     lr, batch_size = args.lr, args.batch_size
  274.     print("============")
  275.     print(str(lr) + " " + str(batch_size))
  276.  
  277.     train = pd.read_csv("../input/upwork02/train.csv").sample(frac=1)
  278.     test = pd.read_csv("../input/upwork02/test.csv").sample(frac=1)
  279.  
  280.  
  281.     train = train[(train.label != 'Expertise - "Teach Me"') & (train.label != "Other") & (train.label != "Business Opportunity ")]
  282.  
  283.     le = preprocessing.LabelEncoder()
  284.     le.fit(train.label)
  285.  
  286.  
  287.     train.label = le.transform(train.label)
  288.     test.label = le.transform(test.label)
  289.  
  290.     train["label_names"] = le.inverse_transform(train.label)
  291.     test["label_names"] = le.inverse_transform(test.label)
  292.  
  293.     labels_ids = dict(zip(le.classes_, le.transform(le.classes_)))
  294.  
  295.     #######################
  296.  
  297.     train_df = train
  298.     test_df = test
  299.  
  300.     model_name = "roberta-large"
  301.     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  302.     model, tokenizer = get_model_tokenizer()
  303.  
  304.  
  305.     gpt2_classificaiton_collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer,
  306.                                                             labels_encoder=labels_ids,
  307.                                                             max_sequence_len=96)
  308.  
  309.  
  310.     train_dataset = ClassificationDataset(train_df)
  311.     train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classificaiton_collator)
  312.  
  313.     valid_dataset = ClassificationDataset(test_df)
  314.     valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classificaiton_collator)
  315.  
  316.  
  317.     optimizer = AdamW(model.parameters(),
  318.                     lr = lr, # default is 5e-5, our notebook had 2e-5
  319.                     eps = 1e-8 # default is 1e-8.
  320.                     )
  321.  
  322.  
  323.     total_steps = len(train_dataloader) * epochs
  324.  
  325.     scheduler = get_linear_schedule_with_warmup(optimizer,
  326.                                                 num_warmup_steps = 0, # Default value in run_glue.py
  327.                                                 num_training_steps = total_steps)
  328.  
  329.     best_preds, best_labels, best_val_acc = train_model(model, epochs, train_dataloader, valid_dataloader, device, scheduler,  prefix="binary_")
  330.  
  331.  
  332.     print(best_val_acc)
  333.  
  334.  
  335. main()
RAW Paste Data