Cipher404

Untitled

Oct 24th, 2025
290
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.60 KB | None | 0 0
  1. import kagglehub
  2. from kagglehub import KaggleDatasetAdapter
  3. import pandas as pd
  4. import json
  5. import torch
  6. from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
  7. from sklearn.metrics import f1_score
  8. import numpy as np
  9. from datasets import Dataset
  10.  
  11. # Set random seed for reproducibility
  12. torch.manual_seed(42)
  13. np.random.seed(42)
  14.  
  15. # 1. Load Datasets
  16. train_file_path = "train.jsonl"  # Update if different (e.g., for Subtask A)
  17. test_file_path = "test.jsonl"    # Update if different
  18. label_map_file = "label_to_id.json"  # Update if different
  19.  
  20. # Load training and test data
  21. train_df = kagglehub.load_dataset(
  22.     KaggleDatasetAdapter.PANDAS,
  23.     "daniilor/semeval-2026-task13",
  24.     train_file_path
  25. )
  26. test_df = kagglehub.load_dataset(
  27.     KaggleDatasetAdapter.PANDAS,
  28.     "daniilor/semeval-2026-task13",
  29.     test_file_path
  30. )
  31.  
  32. # Load label mappings
  33. label_to_id_path = kagglehub.dataset_download("daniilor/semeval-2026-task13", label_map_file)
  34. with open(label_to_id_path, 'r', encoding='utf-8') as f:
  35.     label_to_id = json.load(f)
  36. id_to_label = {v: k for k, v in label_to_id.items()}
  37.  
  38. # Verify data
  39. print("Training Data (first 5 records):")
  40. print(train_df.head())
  41. print("\nTest Data (first 5 records):")
  42. print(test_df.head())
  43. print("\nLabel Mappings:", label_to_id)
  44.  
  45. # 2. Preprocess Data
  46. # Initialize GraphCodeBERT tokenizer
  47. tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
  48.  
  49. def tokenize_function(examples):
  50.     return tokenizer(examples['code'], padding="max_length", truncation=True, max_length=512)
  51.  
  52. # Convert to Hugging Face Dataset
  53. train_dataset = Dataset.from_pandas(train_df[['code', 'label']])
  54. test_dataset = Dataset.from_pandas(test_df[['id', 'code']])
  55.  
  56. # Tokenize datasets
  57. train_dataset = train_dataset.map(tokenize_function, batched=True)
  58. test_dataset = test_dataset.map(tokenize_function, batched=True)
  59.  
  60. # Set format for PyTorch
  61. train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
  62. test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'id'])
  63.  
  64. # Split training data for validation
  65. train_dataset, val_dataset = train_dataset.train_test_split(test_size=0.1, seed=42).values()
  66.  
# 3. Define Model
# Pretrained GraphCodeBERT encoder with a fresh (randomly initialized)
# sequence-classification head; the head is trained during fine-tuning below.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/graphcodebert-base",
    num_labels=2  # Binary classification for Subtask A
)
  72.  
  73. # 4. Define Metrics
  74. def compute_metrics(pred):
  75.     labels = pred.label_ids
  76.     preds = pred.predictions.argmax(-1)
  77.     macro_f1 = f1_score(labels, preds, average='macro')
  78.     return {"macro_f1": macro_f1}
  79.  
# 5. Set Training Arguments
# NOTE(review): `evaluation_strategy` was renamed to `eval_strategy` in recent
# transformers releases (old name deprecated, later removed) — confirm against
# the installed version.
training_args = TrainingArguments(
    output_dir="./graphcodebert_finetuned",
    evaluation_strategy="epoch",   # evaluate on the validation split each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",         # must match the eval strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",  # key returned by compute_metrics; higher is better by default
    seed=42
)
  94.  
# 6. Initialize Trainer
# No data_collator/tokenizer passed: inputs were already padded to a fixed
# max_length=512, so the default collator can batch them directly.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# 7. Train Model
# Fine-tunes for num_train_epochs, evaluating and checkpointing each epoch;
# with load_best_model_at_end, reloads the best checkpoint (by macro_f1).
trainer.train()
  106.  
  107. # 8. Generate Predictions
  108. predictions = trainer.predict(test_dataset)
  109. pred_labels = np.argmax(predictions.predictions, axis=1)
  110.  
  111. # 9. Prepare Submission File
  112. submission_df = pd.DataFrame({
  113.     'id': test_dataset['id'],
  114.     'label': pred_labels
  115. })
  116. submission_df.to_csv('submission_graphcodebert.csv', index=False)
  117. print("Submission file saved as 'submission_graphcodebert.csv'")
Advertisement
Add Comment
Please, Sign In to add comment