#!/usr/bin/env python
# coding: utf-8

# In[ ]:

import argparse
import os
import random

import numpy as np
import pandas as pd
import torch
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_recall_fscore_support as score
from torch.optim import AdamW  # transformers' AdamW is deprecated; use the torch optimizer
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm  # tqdm.auto renders correctly in notebooks and plain scripts alike
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer,
                          get_linear_schedule_with_warmup)
# In[ ]:

use_wandb = False
epochs = 8
# In[3]:

def set_seed(seed=12):
    """Set seed for reproducibility across random, numpy, and torch."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed()
# In[4]:

if use_wandb:
    import wandb

    # wandb.login() picks up the WANDB_API_KEY environment variable,
    # so no secret needs to be hard-coded in the script.
    wandb.login()
    wandb.init(project="multi-class-text-classifier")
    wandb.run.name = "two-gpts"
    wandb.run.save()
    config = wandb.config
    config.description = "Establishing baseline from GPT2 model"
    config.model_used = "GPT2-Large"
# In[5]:

def get_model_tokenizer(model_name, num_labels, device):
    # model_name, num_labels, and device are passed in explicitly so the
    # function does not depend on names defined inside main().
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_name, num_labels=num_labels)
    model = model.to(device)
    return model, tokenizer
# In[6]:

class Gpt2ClassificationCollator(object):
    """Collate raw {'text', 'label'} records into padded tensor batches."""

    def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):
        self.use_tokenizer = use_tokenizer
        self.max_sequence_len = (use_tokenizer.model_max_length
                                 if max_sequence_len is None else max_sequence_len)
        self.labels_encoder = labels_encoder

    def __call__(self, sequences):
        texts = [sequence['text'] for sequence in sequences]
        labels = [sequence['label'] for sequence in sequences]
        inputs = self.use_tokenizer(text=texts, return_tensors="pt", padding=True,
                                    truncation=True, max_length=self.max_sequence_len)
        inputs.update({'labels': torch.tensor(labels)})
        return inputs
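# Illustration only (hypothetical samples): the collator maps a list of raw
# records to a single dict that unpacks straight into model(**batch):
#
#     collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer,
#                                           labels_encoder=labels_ids,
#                                           max_sequence_len=96)
#     batch = collator([{'text': 'refund request', 'label': 0},
#                       {'text': 'new feature idea', 'label': 1}])
#     sorted(batch.keys())  # -> ['attention_mask', 'input_ids', 'labels']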
# In[7]:

class ClassificationDataset(Dataset):
    """Wrap a dataframe with 'text' and 'label' columns for the DataLoader."""

    def __init__(self, df):
        self.texts = df.text.values.tolist()
        self.labels = df.label.values.tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        return {'text': self.texts[item],
                'label': self.labels[item]}
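# Illustration only (hypothetical dataframe): each item is a plain dict, so the
# collator above receives raw text plus an already-encoded integer label:
#
#     df = pd.DataFrame({'text': ['refund request'], 'label': [0]})
#     ds = ClassificationDataset(df)
#     ds[0]  # -> {'text': 'refund request', 'label': 0}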
# In[8]:

def train_epoch(dataloader, optimizer_, scheduler_, device_, model):
    predictions_labels = []
    true_labels = []
    total_loss = 0
    model.train()
    for batch in tqdm(dataloader, total=len(dataloader)):
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}
        model.zero_grad()
        outputs = model(**batch)
        loss, logits = outputs[:2]
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Step the optimizer and scheduler that were passed in as arguments.
        optimizer_.step()
        scheduler_.step()
        logits = logits.detach().cpu().numpy()
        predictions_labels += logits.argmax(axis=-1).flatten().tolist()
    avg_epoch_loss = total_loss / len(dataloader)
    return true_labels, predictions_labels, avg_epoch_loss
def validation(dataloader, device_, model):
    predictions_labels = []
    true_labels = []
    total_loss = 0
    model.eval()
    for batch in tqdm(dataloader, total=len(dataloader)):
        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            loss, logits = outputs[:2]
            logits = logits.detach().cpu().numpy()
            total_loss += loss.item()
            predict_content = logits.argmax(axis=-1).flatten().tolist()
            predictions_labels += predict_content
    avg_epoch_loss = total_loss / len(dataloader)
    return true_labels, predictions_labels, avg_epoch_loss
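# Sketch (assumes a trained model and the dataloaders built in main() below):
# the two lists returned by validation() feed directly into sklearn's
# classification_report for a per-class precision/recall/F1 summary:
#
#     y_true, y_pred, _ = validation(valid_dataloader, device, model)
#     print(classification_report(y_true, y_pred))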
# In[9]:

def train_model(model, epochs, train_dataloader, valid_dataloader, device,
                optimizer, scheduler, labels_ids, prefix=""):
    # optimizer and labels_ids are explicit parameters so the function does
    # not reach for names that only exist inside main().
    best_val_acc = 0
    best_preds = None
    best_labels = None
    if use_wandb:
        wandb.watch(model)
    print('Epoch')
    for epoch in tqdm(range(epochs)):
        print()
        print('Training on batches...')
        train_labels, train_predict, train_loss = train_epoch(train_dataloader, optimizer, scheduler, device, model)
        train_acc = accuracy_score(train_labels, train_predict)
        print('Validation on batches...')
        valid_labels, valid_predict, val_loss = validation(valid_dataloader, device, model)
        val_acc = accuracy_score(valid_labels, valid_predict)
        if use_wandb:
            # Log per-epoch metrics.
            wandb.log({"train_loss": train_loss})
            wandb.log({"train_acc": train_acc})
            wandb.log({"test_loss": val_loss})
            wandb.log({"test_acc": val_acc})
            precision, recall, fscore, support = score(valid_labels, valid_predict)
            for k, n in enumerate(labels_ids):
                wandb.log({f"precision-{n}": precision[k]})
                wandb.log({f"recall-{n}": recall[k]})
                wandb.log({f"fscore-{n}": fscore[k]})
        if val_acc > best_val_acc:
            torch.save(model, prefix + "best_model.pt")
            best_val_acc = val_acc
            best_preds = valid_predict
            best_labels = valid_labels
        # Print loss and accuracy values to see how training evolves.
        print(" train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f" % (train_loss, val_loss, train_acc, val_acc))
        print()
    return best_preds, best_labels, best_val_acc
def main():
    # In[10]:
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", type=float)
    parser.add_argument("--batch_size", type=int)
    args = parser.parse_args()
    lr, batch_size = args.lr, args.batch_size
    print("============")
    print(str(lr) + " " + str(batch_size))

    train = pd.read_csv("../input/upwork02/train.csv").sample(frac=1)
    test = pd.read_csv("../input/upwork02/test.csv").sample(frac=1)
    train = train[(train.label != 'Expertise - "Teach Me"') & (train.label != "Other") & (train.label != "Business Opportunity ")]

    le = preprocessing.LabelEncoder()
    le.fit(train.label)
    train.label = le.transform(train.label)
    test.label = le.transform(test.label)
    train["label_names"] = le.inverse_transform(train.label)
    test["label_names"] = le.inverse_transform(test.label)
    labels_ids = dict(zip(le.classes_, le.transform(le.classes_)))

    #######################
    train_df = train
    test_df = test
    model_name = "roberta-large"  # despite the GPT-2 naming elsewhere, this run uses RoBERTa
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model, tokenizer = get_model_tokenizer(model_name, len(labels_ids), device)
    gpt2_classification_collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer,
                                                              labels_encoder=labels_ids,
                                                              max_sequence_len=96)
    train_dataset = ClassificationDataset(train_df)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classification_collator)
    valid_dataset = ClassificationDataset(test_df)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classification_collator)
    optimizer = AdamW(model.parameters(),
                      lr=lr,     # default is 5e-5, our notebook had 2e-5
                      eps=1e-8)  # default is 1e-8
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # default value in run_glue.py
                                                num_training_steps=total_steps)
    best_preds, best_labels, best_val_acc = train_model(model, epochs, train_dataloader, valid_dataloader,
                                                        device, optimizer, scheduler, labels_ids, prefix="binary_")
    print(best_val_acc)


if __name__ == "__main__":
    main()
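# Example invocation (hypothetical script name and values; the CSVs read in
# main() are expected to have 'text' and 'label' columns):
#
#     python train_classifier.py --lr 2e-5 --batch_size 16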