Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # B.2 Smart dedup by score - slow : Modify from https://pastebin.com/qaGG7NSM
- import json, ijson
- import torch
- from transformers import AutoTokenizer
- from tqdm import tqdm
- from llm_blender.pair_ranker.pairrm import DebertaV2PairRM # pip install git+https://github.com/yuchenlin/LLM-Blender.git
# --- PairRM reward-model setup -------------------------------------------
reward_name = "llm-blender/PairRM-hf"

# Decide the device BEFORE loading the model. The original loaded with a
# hard-coded device_map="cuda:0" and only then computed the CPU fallback,
# which crashes outright on CPU-only hosts.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

rank_model = DebertaV2PairRM.from_pretrained(reward_name, device_map=device).eval()
tokenizer = AutoTokenizer.from_pretrained(reward_name)
rank_model.to(device)

DEBUG_SCORE = False     # If True, embed the formatted score in each output object
SCORE_THRESHOLD = 0.01  # Minimum usefulness score an example must exceed to be kept
def score_object(obj):
    """Score one instruction/response example with the PairRM reward model.

    Parameters
    ----------
    obj : dict
        An alpaca-style record with 'instruction', 'input' and 'output'
        string fields — assumed from the keys read here; confirm against
        the dataset schema.

    Returns
    -------
    tuple[dict, float]
        The object (augmented with a six-decimal 'score' string when
        DEBUG_SCORE is on) and the scalar reward score.
    """
    # NOTE(review): PairRM is a *pairwise* ranker; this feeds it a single
    # concatenated text and reads one logit — verify this single-sequence
    # usage is intended.
    question = f"{obj['instruction']} {obj['input']}"
    answer = obj['output']
    # Note: question and answer are joined with no separator, matching the
    # original concatenation exactly.
    inputs = tokenizer(question + answer, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = rank_model(**inputs)
    score = outputs.logits.item()  # single scalar logit used as the reward
    if DEBUG_SCORE:
        return {**obj, 'score': f"{score:.6f}"}, score
    return obj, score
def score_and_sort_data(input_file, output_file):
    """Score every example in *input_file*, keep the useful ones, and write
    them to *output_file* sorted by descending reward score.

    Parameters
    ----------
    input_file : str
        Path to a JSON array of alpaca-style records; streamed with ijson
        rather than loaded wholesale.
    output_file : str
        Destination path for the filtered, sorted JSON array.
    """
    with open(input_file, 'r', encoding='utf-8-sig') as f:
        items = ijson.items(f, 'item')
        scored = [score_object(obj) for obj in tqdm(items, desc="Scoring data")]

    # Keep each surviving object together with its NUMERIC score so the
    # sort below can use it.
    kept = [(obj, score) for obj, score in scored if score > SCORE_THRESHOLD]

    # BUGFIX: the original sort key was
    #   lambda x: x.get('score', 0) if DEBUG_SCORE else score
    # — with DEBUG_SCORE off, `score` is the stale loop variable (a
    # constant, the last example's score), so nothing was actually sorted;
    # with it on, the *string*-formatted 'score' field was compared
    # lexicographically, mis-ordering negative/mixed values. Sort on the
    # numeric score directly instead.
    kept.sort(key=lambda pair: pair[1], reverse=True)
    sorted_objects = [obj for obj, _ in kept]

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sorted_objects, f, ensure_ascii=False, indent=2)
if __name__ == "__main__":
    # Guard the entry point so importing this module does not immediately
    # kick off a full scoring run.
    score_and_sort_data('en_output.json', 'en_output_scoreQA.json')
Advertisement
Add Comment
Please, Sign In to add comment