Advertisement
maxpower51

Untitled

Mar 3rd, 2021
454
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.63 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import datasets
  4. import json
  5. import torch
  6. from sklearn import metrics
  7. from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
  8. from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
  9. from transformers import BertModel, BertTokenizer
  10. from sklearn.model_selection import train_test_split
  11. from datasets import Dataset
  12. from torch import cuda
# Run on GPU when available, otherwise fall back to CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

# German uncased BERT checkpoint used for both the tokenizer and the encoder.
MODEL_NAME = 'dbmdz/bert-base-german-uncased'
# NOTE(review): SEED is defined but never applied in this file (no
# torch.manual_seed / np.random.seed call visible) — confirm seeding
# happens elsewhere, otherwise runs are not reproducible.
SEED = 321
  17.  
  18.  
  19. def compute_metrics_multilables_b(eval_pred):
  20.     predictions, labels = eval_pred
  21.     predictions = torch.tensor(predictions)
  22.     preds_full = torch.sigmoid(predictions).cpu().detach().numpy().tolist()
  23.    
  24.     preds_full = np.array(preds_full) >= 0.5
  25.     labels = np.array(labels) >= 0.5
  26.    
  27.     accuracy = metrics.accuracy_score(labels, preds_full)
  28.     f1_score_micro = metrics.f1_score(labels, preds_full, average='micro')
  29.     f1_score_macro = metrics.f1_score(labels, preds_full, average='macro')
  30.     metrics_result = {
  31.                 'accuracy': accuracy,
  32.                 'f1_micro': f1_score_micro,
  33.                 'f1_macro': f1_score_macro,
  34.     }
  35.    
  36.     return metrics_result
  37.  
  38. class EmotionDataset(torch.utils.data.Dataset):
  39.     def __init__(self, encodings, labels):
  40.         self.encodings = encodings
  41.         self.labels = labels
  42.  
  43.     def __getitem__(self, idx):
  44.         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  45.         item['labels'] = torch.tensor(self.labels[idx])
  46.         return item
  47.  
  48.     def __len__(self):
  49.         return len(self.labels)
  50.  
  51. class CustomTrainer(Trainer):
  52.     def compute_loss(self, model, inputs):
  53.         labels = inputs.pop("labels")
  54.         outputs = model(inputs['input_ids'], inputs['attention_mask'], inputs['token_type_ids'])
  55.         labels = labels.type_as(outputs)
  56.         logits = outputs
  57.         return torch.nn.BCEWithLogitsLoss()(logits, labels)
  58.        
  59. class MultiLabelClassifier(torch.nn.Module):
  60.     def __init__(self):
  61.         super(MultiLabelClassifier, self).__init__()
  62.         self.l1 = BertModel.from_pretrained(MODEL_NAME)
  63.         self.l2 = torch.nn.Dropout(0.3)
  64.         # output is a 8-dim vector
  65.         self.l3 = torch.nn.Linear(768, 8)
  66.    
  67.     def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
  68.                 head_mask=None, inputs_embeds=None, labels=None, output_attentions=None,
  69.                 output_hidden_states=None, return_dict=None):
  70.         output_1 = self.l1(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids).pooler_output
  71.         output_2 = self.l2(output_1)
  72.         output = self.l3(output_2)
  73.         return output
  74.  
  75.  
  76. dataset_train = Dataset.from_pandas(df_train)
  77. dataset_validation = Dataset.from_pandas(df_validation)
  78. dataset_test = Dataset.from_pandas(df_test)
  79.  
  80.  
  81. ### load model and tokenizer
  82. tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  83. model = BertModel.from_pretrained(MODEL_NAME)
  84.  
  85. ### preprocess data
  86. field_text = "Text"
  87. field_label = "list"
  88.  
  89. ### tokenize data
  90. train_encodings = tokenizer(dataset_train[field_text], truncation=True, padding=True)
  91. val_encodings = tokenizer(dataset_validation[field_text], truncation=True, padding=True)
  92. test_encodings = tokenizer(dataset_test[field_text], truncation=True, padding=True)
  93.  
  94. train_dataset = EmotionDataset(train_encodings, dataset_train[field_label])
  95. val_dataset = EmotionDataset(val_encodings, dataset_validation[field_label])
  96. test_dataset = EmotionDataset(test_encodings, dataset_test[field_label])
  97.  
  98.  
  99.  
# Instantiate the classifier (this loads the BERT weights) and move it to
# the selected device; Trainer also manages device placement internally.
model = MultiLabelClassifier()
_  = model.to(device)



# Hyperparameters for a single short training run.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer = CustomTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    # NOTE(review): evaluation runs on the TEST split; val_dataset is built
    # above but never used — confirm this is intentional and not a slip.
    eval_dataset=test_dataset,            # evaluation dataset
    compute_metrics=compute_metrics_multilables_b
)

# Train, then report metrics (accuracy / micro-F1 / macro-F1) on eval_dataset.
_ = trainer.train()
trainer.evaluate()
  126.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement