Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # B.2 Smart dedup by score - slow : Modify from https://pastebin.com/qaGG7NSM
- import json, ijson
- import torch
- from transformers import AutoTokenizer
- from tqdm import tqdm
- from llm_blender.pair_ranker.pairrm import DebertaV2PairRM # pip install git+https://github.com/yuchenlin/LLM-Blender.git
# --- PairRM reward-model setup -------------------------------------------
reward_name = "llm-blender/PairRM-hf"

# Decide the device BEFORE loading the model. The original loaded with a
# hard-coded device_map="cuda:0" and only then computed the CPU fallback,
# which crashes outright on CPU-only hosts.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

rank_model = DebertaV2PairRM.from_pretrained(reward_name, device_map=device).eval()
tokenizer = AutoTokenizer.from_pretrained(reward_name)
rank_model.to(device)

DEBUG_SCORE = False     # If True, embed the formatted score in each output object
SCORE_THRESHOLD = 0.01  # Minimum usefulness score an example must exceed to be kept
def score_object(obj):
    """Score one instruction/response example with the PairRM reward model.

    Parameters
    ----------
    obj : dict
        An alpaca-style record with 'instruction', 'input' and 'output'
        string fields — assumed from the keys read here; confirm against
        the dataset schema.

    Returns
    -------
    tuple[dict, float]
        The object (augmented with a six-decimal 'score' string when
        DEBUG_SCORE is on) and the scalar reward score.
    """
    # NOTE(review): PairRM is a *pairwise* ranker; this feeds it a single
    # concatenated text and reads one logit — verify this single-sequence
    # usage is intended.
    question = f"{obj['instruction']} {obj['input']}"
    answer = obj['output']
    # Note: question and answer are joined with no separator, matching the
    # original concatenation exactly.
    inputs = tokenizer(question + answer, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = rank_model(**inputs)
    score = outputs.logits.item()  # single scalar logit used as the reward
    if DEBUG_SCORE:
        return {**obj, 'score': f"{score:.6f}"}, score
    return obj, score
def score_and_sort_data(input_file, output_file):
    """Score every example in *input_file*, keep the useful ones, and write
    them to *output_file* sorted by descending reward score.

    Parameters
    ----------
    input_file : str
        Path to a JSON array of alpaca-style records; streamed with ijson
        rather than loaded wholesale.
    output_file : str
        Destination path for the filtered, sorted JSON array.
    """
    with open(input_file, 'r', encoding='utf-8-sig') as f:
        items = ijson.items(f, 'item')
        scored = [score_object(obj) for obj in tqdm(items, desc="Scoring data")]

    # Keep each surviving object together with its NUMERIC score so the
    # sort below can use it.
    kept = [(obj, score) for obj, score in scored if score > SCORE_THRESHOLD]

    # BUGFIX: the original sort key was
    #   lambda x: x.get('score', 0) if DEBUG_SCORE else score
    # — with DEBUG_SCORE off, `score` is the stale loop variable (a
    # constant, the last example's score), so nothing was actually sorted;
    # with it on, the *string*-formatted 'score' field was compared
    # lexicographically, mis-ordering negative/mixed values. Sort on the
    # numeric score directly instead.
    kept.sort(key=lambda pair: pair[1], reverse=True)
    sorted_objects = [obj for obj, _ in kept]

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sorted_objects, f, ensure_ascii=False, indent=2)
if __name__ == "__main__":
    # Guard the entry point so importing this module does not immediately
    # kick off a full scoring run.
    score_and_sort_data('en_output.json', 'en_output_scoreQA.json')
Advertisement
Add Comment
Please, Sign In to add comment