Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import pandas as pd
import numpy as np
import datasets
import json
import torch
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from torch import cuda

# Run on GPU when one is available, otherwise fall back to CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
# German uncased BERT checkpoint used as the text encoder throughout.
MODEL_NAME = 'dbmdz/bert-base-german-uncased'
# NOTE(review): SEED is defined but never used in this excerpt — presumably
# intended for train_test_split / torch.manual_seed; confirm against the
# rest of the file.
SEED = 321
def compute_metrics_multilables_b(eval_pred):
    """Compute multi-label metrics (subset accuracy, micro/macro F1) for the HF Trainer.

    Args:
        eval_pred: ``(predictions, labels)`` pair as supplied by the Trainer;
            ``predictions`` are raw logits with shape (n_samples, n_labels).

    Returns:
        dict with keys ``'accuracy'``, ``'f1_micro'`` and ``'f1_macro'``.
    """
    predictions, labels = eval_pred
    # Logits -> probabilities -> hard 0/1 decisions at the 0.5 threshold.
    # (The original detoured through .detach().numpy().tolist() and back to
    # np.array — a needless copy; a fresh tensor built from numpy also has
    # nothing to detach from.)
    probs = torch.sigmoid(torch.as_tensor(predictions)).cpu().numpy()
    preds_full = probs >= 0.5
    # Labels may arrive as floats; binarize them with the same threshold.
    labels = np.asarray(labels) >= 0.5
    return {
        # subset accuracy: a sample counts only if ALL its labels match
        'accuracy': metrics.accuracy_score(labels, preds_full),
        'f1_micro': metrics.f1_score(labels, preds_full, average='micro'),
        'f1_macro': metrics.f1_score(labels, preds_full, average='macro'),
    }
class EmotionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenized encodings with multi-label targets."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Assemble one sample: every tokenizer field (input_ids,
        # attention_mask, ...) plus its label vector, each as a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
class CustomTrainer(Trainer):
    """Trainer that applies BCEWithLogitsLoss for multi-label classification."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute multi-label binary cross-entropy loss.

        ``return_outputs`` and ``num_items_in_batch`` keep the signature
        compatible with the hooks current ``transformers`` versions invoke
        during ``evaluate()``/``predict()`` — the original two-argument form
        raises a TypeError there. Both default so existing callers are
        unaffected.
        """
        labels = inputs.pop("labels")
        # MultiLabelClassifier.forward returns raw logits directly
        # (positional order: input_ids, attention_mask, token_type_ids).
        logits = model(inputs['input_ids'], inputs['attention_mask'], inputs['token_type_ids'])
        labels = labels.type_as(logits)  # BCEWithLogitsLoss requires float targets
        loss = torch.nn.BCEWithLogitsLoss()(logits, labels)
        return (loss, logits) if return_outputs else loss
class MultiLabelClassifier(torch.nn.Module):
    """German BERT encoder with dropout and a linear head over 8 emotion labels."""

    def __init__(self):
        super().__init__()
        # l1: pretrained encoder; l2: regularization; l3: classification head.
        self.l1 = BertModel.from_pretrained(MODEL_NAME)
        self.l2 = torch.nn.Dropout(0.3)
        # 768 = BERT-base hidden size; output is an 8-dim vector.
        self.l3 = torch.nn.Linear(768, 8)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
                head_mask=None, inputs_embeds=None, labels=None, output_attentions=None,
                output_hidden_states=None, return_dict=None):
        """Return raw (unnormalized) logits of shape (batch, 8)."""
        pooled = self.l1(input_ids, attention_mask=attention_mask,
                         token_type_ids=token_type_ids).pooler_output
        return self.l3(self.l2(pooled))
### build HF datasets from the prepared train/validation/test dataframes
# NOTE(review): df_train / df_validation / df_test are not defined in this
# excerpt — presumably created earlier in the file; verify before running
# this section standalone.
dataset_train = Dataset.from_pandas(df_train)
dataset_validation = Dataset.from_pandas(df_validation)
dataset_test = Dataset.from_pandas(df_test)

### load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# (The original also loaded a bare BertModel here; that instance was never
# used and was immediately shadowed by MultiLabelClassifier below, so the
# dead — and expensive — download has been removed.)

### preprocess data
field_text = "Text"    # column holding the raw input text
field_label = "list"   # column holding the 8-dim multi-hot label vector

### tokenize each split (pad/truncate to a common length)
train_encodings = tokenizer(dataset_train[field_text], truncation=True, padding=True)
val_encodings = tokenizer(dataset_validation[field_text], truncation=True, padding=True)
test_encodings = tokenizer(dataset_test[field_text], truncation=True, padding=True)

train_dataset = EmotionDataset(train_encodings, dataset_train[field_label])
val_dataset = EmotionDataset(val_encodings, dataset_validation[field_label])
test_dataset = EmotionDataset(test_encodings, dataset_test[field_label])

model = MultiLabelClassifier()
_ = model.to(device)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer = CustomTrainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    # NOTE(review): evaluation runs on the TEST split while val_dataset is
    # built above but never used — confirm this is intentional.
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics_multilables_b
)

_ = trainer.train()
trainer.evaluate()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement