cleanDatasetNPair.py

import os
import json
import time

import torch
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from sacrebleu.metrics import BLEU
from safetensors import safe_open
from safetensors.torch import save_file

from g4f.client import Client

client = Client()

def completionsAPI(text):
    """Query a g4f provider and return the completion text."""
    response = client.chat.completions.create(
        model="dolphin-mixtral-8x7b",  # alternatives: claude_v2, gemini-pro, gpt-4-turbo, gpt-4-32k, pi
        messages=[{"role": "user", "content": text}]
    )
    return response.choices[0].message.content

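# Illustrative sketch, not part of the original script: free g4f providers fail
# intermittently, so a thin retry wrapper around completionsAPI() can help when
# create_summary() runs for a long time. Retry count and delay are arbitrary assumptions.
def completionsAPI_with_retry(text, retries=3, delay=5):
    for attempt in range(retries):
        try:
            return completionsAPI(text)
        except Exception as e:  # g4f raises provider-specific errors; catch broadly here
            print(f"completionsAPI attempt {attempt + 1}/{retries} failed: {e}")
            time.sleep(delay)
    raise RuntimeError("completionsAPI failed after all retries")
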
def load_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_dataset(file_path, dataset):
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)

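# Optional hardening sketch (an assumption, not in the original): create_summary()
# rewrites the JSON file after every item, so a crash mid-write can corrupt it.
# Writing to a temporary file and renaming it into place keeps the on-disk copy valid.
def save_dataset_atomic(file_path, dataset):
    tmp_path = file_path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)
    os.replace(tmp_path, file_path)  # atomic replace on the same filesystem
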
def completions(text):  # local text-generation-webui endpoint
    response = requests.post(
        "http://127.0.0.1:5000/v1/completions",
        headers={"Content-Type": "application/json"},
        json={"prompt": text, "max_tokens": 200, "temperature": 1, "top_p": 0.9, "seed": 10},
    )
    return response.json()['choices'][0]['text']

def print_obj(index, file_path):  # debug helper: print one input/label pair
    with open(file_path, 'r', encoding='utf-8') as f:
        d = json.load(f)
    print(d['input_ids'][index] + ' |||| ' + d['labels'][index])

def create_summary():
    file_path = 'dataset_CLIP.json'
    before_text = 'You are an expert summary writer with many years of experience. Construct efficient, well-structured, objectively optimized summaries of TEXT, distilling the core message while omitting unnecessary, obvious, or irrelevant details with precision and brevity. Emphasize the essential ideas while dropping non-critical descriptors. Concisely capture meaningful tags, entities, and core arguments. Let\'s think through this rationally, step by step. In the example below, enhancement words such as "best quality, masterpiece, detailed, detailed face" are not useful; "windy, floating hair, snowy" all relate to winter and are not worth keeping; "trees" is only a background element. Keep the output JSON-valid and use very few words. Example:\ntext = "best quality, masterpiece, White hair,detailed, red eyes, windy, floating hair, snowy, upper body, detailed face, winter, trees, sunshine"\nsummary = "White hair, red eyes, winter in sunshine"\n\nPlease summarize the following: text ="'
    after_text = '"\nsummary = "'

    dataset = load_dataset(file_path)
    # Resume at the first empty label; if none is empty, everything is already done.
    start_index = next((i for i, x in enumerate(dataset['labels']) if x == ""), len(dataset['labels']))
    for i in tqdm(range(start_index, len(dataset['input_ids'])), desc="Processing dataset"):
        text = before_text + dataset['input_ids'][i] + after_text
        dataset['labels'][i] = completionsAPI(text)  # or completions() for the local endpoint - fills empty labels
        save_dataset(file_path, dataset)
        time.sleep(3)

def get_unhelpful_list():  # https://github.com/h2oai/h2ogpt/blob/main/src/create_data.py
    unhelpful = ["I'm sorry, I didn't quite understand your question, could you please rephrase it?",
                 "I'm sorry, but I don't understand your question. Could you please rephrase it?",
                 "I'm sorry, I don't quite understand your question",
                 "I'm sorry, I don't know",
                 "I'm sorry, but I don't know",
                 "I don't know anything",
                 "I do not know",
                 "I don't know",
                 "I don't know how",
                 "I do not know how",
                 "Can you please explain what you mean",
                 "please explain what you mean",
                 "please explain",
                 "I'm sorry, but I don't know how to tell a story. Can you please explain what you mean by",
                 "I'm sorry but I don't understand what you mean",
                 "I don't understand",
                 "I don't have the ability",
                 "I do not have the ability",
                 "I do not have",
                 "I am a language model,",
                 "I am a large language model,",
                 "I do not understand your question. Can you please try to make it clearer?",
                 "I'm sorry, but as an AI language model",
                 "I apologize, but I cannot rephrase text that I cannot understand. Your post is difficult to read and follow.",
                 "I apologize, but I am not h2oGPT. I am a language model developed by H2O.ai. How may I help you?",
                 "Sorry, but I am not an actual Linux shell, nor am I capable of emulating one. I am an open source chat assistant and would be glad t",
                 "I apologize, but I cannot perform the task you have requested.",
                 "I'm sorry, I cannot perform this task as I am an AI language model and do not have access",
                 "I'm sorry, I'm not sure what you're asking for here.",
                 "I'm not sure what you are asking",
                 "You need to provide more context",
                 ]
    # reduced versions, with redundant parts, just to give context for where they came from
    unhelpful += ["sorry, I didn't quite understand your question",
                  "I didn't quite understand your question",
                  "I didn't understand your question",
                  "I did not understand your question",
                  "I did not understand the question",
                  "could you please rephrase",
                  "could you rephrase",
                  "I do not understand your question.",
                  "I do not understand the question.",
                  "I do not understand that question.",
                  "Can you please try to make it clearer",
                  "Can you try to make it clearer",
                  "sorry, but as an AI language model",
                  "as an AI language model",
                  "I apologize, but I cannot",
                  "I cannot rephrase text",
                  "I cannot understand. Your post is difficult to read and follow.",
                  "Your post is difficult to read and follow.",
                  "I apologize, but I am",
                  "Sorry, but I am not ",
                  "nor am I capable",
                  "I am not capable of",
                  "I apologize, but I cannot perform the task you have requested",
                  "I cannot perform the task",
                  "I cannot complete the task",
                  "I'm sorry",
                  "I am sorry",
                  "do not have access",
                  "not sure what you're asking for",
                  "not sure what you are asking for",
                  "not sure what is being asked",
                  "I'm not sure what you are asking",
                  "not sure what you are asking",
                  "You need to provide more context",
                  "provide more context",
                  ]
    unhelpful += ["As a large language model",
                  "cannot provide any information",
                  "As an artificial intelligence I do not have the capability",
                  "As an artificial intelligence I don't have the capability",
                  "As an artificial intelligence I can't",
                  "As an artificial intelligence I cannot",
                  "I am sorry but I do not understand",
                  "Can you please explain",
                  "(sorry couldn't resist)",
                  "(sorry could not resist)",
                  " :)",
                  " ;)",
                  " :-)",
                  " ;-)",
                  " lol ",
                  "Thanks so much!!!",
                  "Thank You :)!!!",
                  "Please try not to repeat",
                  "I am an AI language model",
                  "I'm a AI assistant that",
                  "I'm an AI assistant that",
                  "I am an AI assistant that",
                  "etc.",
                  "etc.etc.",
                  "etc. etc.",
                  "etc etc",
                  "I'm sorry but I cannot",
                  ]
    return unhelpful

def get_bot_response(input_string):
    """Extract the bot's reply from a "<human>:"/"<bot>:" formatted transcript."""
    bot_response = input_string.split("<bot>:")[-1].split("<human>:")[0].strip()
    return bot_response

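# Quick illustrative check (not in the original) of how get_bot_response() slices a
# turn-formatted string: it keeps the text after the last "<bot>:" marker, up to the
# next "<human>:" turn if one follows.
# >>> get_bot_response("<human>: hi <bot>: hello there <human>: thanks")
# 'hello there'
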
def test_check_unhelpful_stats(string_test, df=None, use_bleu_threshold=False):
    """
    Return True if string_test looks like a helpful response, False if it matches
    known unhelpful/refusal patterns by cosine similarity (and optionally by BLEU).

    Usage:
    string_tests = ["15 given the equation",
                    "I'm sorry but I cannot summarize this text for you. It contains explicit and inappropriate content that violates my terms of use. Please refrain from sending such texts in the future. Thank you for your understanding.🙏",
                    "I'm sorry, I cannot answer"]
    for string_test in string_tests:
        is_helpful = test_check_unhelpful_stats(string_test, df, use_bleu_threshold=True)
        print(f"Is the response '{string_test}' helpful? {'Yes' if is_helpful else 'No'}")
    """
    unhelpful = get_unhelpful_list()
    sent_model = 'all-MiniLM-L6-v2'
    model = SentenceTransformer(sent_model)
    # Cache the embeddings of the unhelpful phrases so they are encoded only once.
    sentence_embeddings_file = 'sentence_embeddings.safetensors'
    if os.path.exists(sentence_embeddings_file):
        with safe_open(sentence_embeddings_file, framework="pt", device="cpu") as f:  # loaded on CPU; compared on CPU below
            sentence_embeddings = f.get_tensor('sentence_embeddings')
    else:
        sentence_embeddings = model.encode(unhelpful)
        sentence_embeddings = torch.from_numpy(sentence_embeddings)
        save_file({'sentence_embeddings': sentence_embeddings}, sentence_embeddings_file)
    cosine_sim_threshold = 0.6
    bleu = BLEU(effective_order=True)
    bleu_threshold = 40
    if df is not None:
        # Keep only above-average graded rows; note the filtered df is not used further below.
        threshold = df['grade_deberta'].mean()
        df = df[df['grade_deberta'] > threshold]
    test_embedding = model.encode([string_test])
    test_embedding = torch.from_numpy(test_embedding)
    max_sim = np.max(cosine_similarity(test_embedding.cpu().numpy(), sentence_embeddings.cpu().numpy()))
    is_unhelpful_cosine = max_sim >= cosine_sim_threshold
    # Only apply the BLEU check when requested (the original computed it unconditionally).
    bleu_score = bleu.sentence_score(string_test, unhelpful).score if use_bleu_threshold else 0.0
    is_unhelpful_bleu = use_bleu_threshold and bleu_score >= bleu_threshold
    is_helpful = not (is_unhelpful_cosine or is_unhelpful_bleu)
    if not is_helpful:
        print(f"Max cosine similarity: {max_sim} > Method cosine: {is_unhelpful_cosine}, "
              f"Method bleu: {is_unhelpful_bleu} < Bleu score: {bleu_score}:\nString test: {string_test}\n")
    return is_helpful

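# Optional optimization sketch (an assumption, not in the original):
# test_check_unhelpful_stats() reloads the SentenceTransformer on every call, which is
# slow inside remove_unhelpful_labels()'s loop. A module-level cache loads it once.
_sent_model_cache = {}

def get_sent_model(name='all-MiniLM-L6-v2'):
    if name not in _sent_model_cache:
        _sent_model_cache[name] = SentenceTransformer(name)
    return _sent_model_cache[name]
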
def remove_unhelpful_labels(file_path):
    from datasets import load_dataset  # shadows the local load_dataset() helper inside this scope
    dataset = load_dataset('h2oai/openassistant_oasst1_h2ogpt_graded')
    df = pd.DataFrame(dataset['train'])
    df['bot_response'] = df['input'].apply(get_bot_response)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Iterate backwards so deletions do not shift the remaining indices.
    for i in tqdm(range(len(data['labels']) - 1, -1, -1)):
        label = data['labels'][i].strip()  # remove leading and trailing spaces
        if not label or not test_check_unhelpful_stats(label, df, use_bleu_threshold=True):  # empty or unhelpful
            del data['input_ids'][i]
            del data['labels'][i]
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

def remove_word_pair_label(file_path):  # simple keyword cleaning filter
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Iterate backwards so deletions do not shift the remaining indices.
    for i in tqdm(range(len(data['labels']) - 1, -1, -1)):
        label = data['labels'][i].strip()  # remove leading and trailing spaces
        if "sorry" in label.lower() or "cannot" in label.lower():  # drop labels containing "sorry"/"cannot" (case-insensitive)
            del data['input_ids'][i]
            del data['labels'][i]
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

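# Generalization sketch (illustrative; the word list below is an assumption): the same
# keyword filter with a configurable set of refusal markers instead of hard-coded words.
BANNED_WORDS = ("sorry", "cannot", "as an ai")

def label_is_banned(label):
    low = label.lower()
    return any(word in low for word in BANNED_WORDS)
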
# Summary prompts for generating the completion dataset
import traceback
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline

tokenizer = BartTokenizer.from_pretrained('distilbart-cnn-12-6-SD-prompt')  # presumably a local fine-tuned checkpoint directory
model = BartForConditionalGeneration.from_pretrained('distilbart-cnn-12-6-SD-prompt').to('cuda')
summarizer = pipeline('summarization', model=model, tokenizer=tokenizer, device=0)

def summarize_text(text):
    summary = summarizer(text, max_length=20, min_length=5, do_sample=False)
    return summary[0]['summary_text']

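# Throughput sketch (illustrative): transformers pipelines also accept a list of inputs
# plus a batch_size argument, so empty "input" fields could be filled in batches rather
# than one call per item. batch_size=8 is an arbitrary assumption.
def summarize_texts(texts, batch_size=8):
    summaries = summarizer(texts, max_length=20, min_length=5,
                           do_sample=False, batch_size=batch_size)
    return [s['summary_text'] for s in summaries]
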
def summary_prompt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    for index, item in enumerate(tqdm(data, desc="Processing")):
        try:
            if not item["input"]:
                item["input"] = summarize_text(item["output"])
        except RuntimeError as e:
            print(f"Error at item {index + 1}: {e}")
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    #remove_word_pair_label('dataset_CLIP.json')
    #print_obj(1000, 'dataset_CLIP.json')
    #create_summary()
    #remove_unhelpful_labels('dataset_CLIP.json')
    summary_prompt('uniquePrompts.json')