Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import torch, os, json, pandas as pd, numpy as np
- from sentence_transformers import SentenceTransformer
- from sklearn.metrics.pairwise import cosine_similarity
- from nltk.translate.bleu_score import sentence_bleu
- from sacrebleu.metrics import BLEU
- from safetensors import safe_open
- from safetensors.torch import save_file
- from g4f.client import Client
client = Client()

def completionsAPI(text):
    """Send *text* as a single user turn to the g4f chat API and return
    the assistant's reply as a string."""
    chat = client.chat.completions.create(
        model="dolphin-mixtral-8x7b",  # claude_v2, gemini-pro, gpt-4-turbo, gpt-4-32k, dolphin-mixtral-8x7b, pi
        messages=[{"role": "user", "content": text}],
    )
    return chat.choices[0].message.content
- import requests, time
- from tqdm import tqdm
def load_dataset(file_path):
    """Read the UTF-8 JSON file at *file_path* and return the parsed object."""
    with open(file_path, encoding='utf-8') as handle:
        return json.load(handle)
def save_dataset(file_path, dataset):
    """Write *dataset* to *file_path* as pretty-printed UTF-8 JSON
    (non-ASCII characters are kept literal, 4-space indent)."""
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(dataset, handle, ensure_ascii=False, indent=4)
def completions(text):  # local text webui
    """POST *text* to a locally running text-generation-webui
    OpenAI-compatible completions endpoint and return the generated text."""
    payload = {
        "prompt": text,
        "max_tokens": 200,
        "temperature": 1,
        "top_p": 0.9,
        "seed": 10,
    }
    resp = requests.post(
        "http://127.0.0.1:5000/v1/completions",
        headers={"Content-Type": "application/json"},
        data=json.dumps(payload),
    )
    return resp.json()['choices'][0]['text']
def print_obj(index, file_path):  # Debug
    """Print the (input, label) pair at *index* from a JSON dataset file.

    The file must hold parallel string lists under 'input_ids' and 'labels'.
    Fix: the original passed a bare open() to json.load, leaking the file
    handle; a with-block closes it deterministically.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        d = json.load(f)
    print(d['input_ids'][index] + ' |||| ' + d['labels'][index])
def create_summary():
    """Fill every empty label in dataset_CLIP.json with an LLM-written
    summary of the matching prompt, saving the file after each item so the
    run can be resumed at the first still-empty label."""
    file_path = 'dataset_CLIP.json'
    before_text = 'you are expert in the field of write summary for many years. Construct efficient, well-structured, objectively optimized summaries of TEXT, distilling the core message while avoiding unnecessary, obvious, or irrelevant factual information with precision and brevity. Emphasize main essential ideas while omitting non-critical descriptors and elements. Concisely capture meaningful tags, entities, and core arguments. Let\'s rationally think step by step through this carefully, In this text as Example it show any enchance word "best quality, masterpiece,detailed,detailed face" are not usful and all sentence of "windy, floating hair, snowy" are related to winter and is not usful to keep and as for "trees" is background element, do not forget to keep it JSON Valid and be very concise and short number of words. Example: \ntext = "best quality, masterpiece, White hair,detailed, red eyes, windy, floating hair, snowy, upper body, detailed face, winter, trees, sunshine"\nsummary = "White hair, red eyes, winter in sunshine"\n\n Please summary the following: text ="'
    after_text = '"\nsummary = "'
    dataset = load_dataset(file_path)
    # Resume at the first empty label; fall back to 0 when none is empty.
    start_index = 0
    for idx, existing in enumerate(dataset['labels']):
        if existing == "":
            start_index = idx
            break
    for i in tqdm(range(start_index, len(dataset['input_ids'])), desc="Processing dataset"):
        prompt = before_text + dataset['input_ids'][i] + after_text
        # completions() or completionsAPI() — write the empty labels.
        dataset['labels'][i] = completionsAPI(prompt)
        save_dataset(file_path, dataset)
        time.sleep(3)
def get_unhelpful_list():  # https://github.com/h2oai/h2ogpt/blob/main/src/create_data.py
    """Return the bank of known-unhelpful response phrases used by the
    cosine/BLEU helpfulness filters.

    Fix: the original list was missing commas after four entries
    ("could you please rephrase", "could you rephrase",
    "I cannot understand. Your post is difficult to read and follow.",
    "Your post is difficult to read and follow."), so Python's implicit
    string concatenation silently merged them with their neighbors and the
    intended phrases never appeared in the list.
    """
    unhelpful = ["I'm sorry, I didn't quite understand your question, could you please rephrase it?",
                 "I'm sorry, but I don't understand your question. Could you please rephrase it?",
                 "I'm sorry, I don't quite understand your question",
                 "I'm sorry, I don't know",
                 "I'm sorry, but I don't know",
                 "I don't know anything",
                 "I do not know",
                 "I don't know",
                 "I don't know how",
                 "I do not know how",
                 "Can you please explain what you mean",
                 "please explain what you mean",
                 "please explain",
                 "I'm sorry, but I don't know how to tell a story. Can you please explain what you mean by",
                 "I'm sorry but I don't understand what you mean",
                 "I don't understand",
                 "I don't have the ability",
                 "I do not have the ability",
                 "I do not have",
                 "I am a language model,",
                 "I am a large language model,",
                 "I do not understand your question. Can you please try to make it clearer?",
                 "I'm sorry, but as an AI language model",
                 "I apologize, but I cannot rephrase text that I cannot understand. Your post is difficult to read and follow.",
                 "I apologize, but I am not h2oGPT. I am a language model developed by H2O.ai. How may I help you?",
                 "Sorry, but I am not an actual Linux shell, nor am I capable of emulating one. I am an open source chat assistant and would be glad t",
                 "I apologize, but I cannot perform the task you have requested.",
                 "I'm sorry, I cannot perform this task as I am an AI language model and do not have access",
                 "I'm sorry, I'm not sure what you're asking for here.",
                 "I'm not sure what you are asking",
                 "You need to provide more context",
                 ]
    # reduced versions, with redundant parts, just to give context for where they came from
    unhelpful += ["sorry, I didn't quite understand your question",
                  "I didn't quite understand your question",
                  "I didn't understand your question",
                  "I did not understand your question",
                  "I did not understand the question",
                  "could you please rephrase",  # comma restored
                  "could you rephrase",  # comma restored
                  "I do not understand your question.",
                  "I do not understand the question.",
                  "I do not understand that question.",
                  "Can you please try to make it clearer",
                  "Can you try to make it clearer",
                  "sorry, but as an AI language model",
                  "as an AI language model",
                  "I apologize, but I cannot",
                  "I cannot rephrase text",
                  "I cannot understand. Your post is difficult to read and follow.",  # comma restored
                  "Your post is difficult to read and follow.",  # comma restored
                  "I apologize, but I am",
                  "Sorry, but I am not ",
                  "nor am I capable",
                  "I am not capable of",
                  "I apologize, but I cannot perform the task you have requested",
                  "I cannot perform the task",
                  "I cannot complete the task",
                  "I'm sorry",
                  "I am sorry",
                  "do not have access",
                  "not sure what you're asking for",
                  "not sure what you are asking for",
                  "not sure what is being asked",
                  "I'm not sure what you are asking",
                  "not sure what you are asking",
                  "You need to provide more context",
                  "provide more context",
                  ]
    unhelpful += ["As a large language model",
                  "cannot provide any information",
                  "As an artificial intelligence I do not have the capability",
                  "As an artificial intelligence I don't have the capability",
                  "As an artificial intelligence I can't",
                  "As an artificial intelligence I cannot",
                  "I am sorry but I do not understand",
                  "Can you please explain",
                  "(sorry couldn't resist)",
                  "(sorry could not resist)",
                  " :)",
                  " ;)",
                  " :-)",
                  " ;-)",
                  " lol ",
                  "Thanks so much!!!",
                  "Thank You :)!!!",
                  "Please try not to repeat",
                  "I am an AI language model",
                  "I'm a AI assistant that",
                  "I'm an AI assistant that",
                  "I am an AI assistant that",
                  "etc.",
                  "etc.etc.",
                  "etc. etc.",
                  "etc etc",
                  "I'm sorry but I cannot",
                  ]
    return unhelpful
def get_bot_response(input_string):
    """Extract the bot's reply from a conversation transcript: the text
    after the last '<bot>:' marker and before any following '<human>:'
    marker, with surrounding whitespace stripped."""
    after_bot = input_string.split("<bot>:")[-1]
    reply, _, _ = after_bot.partition("<human>:")
    return reply.strip()
def test_check_unhelpful_stats(string_test, df=None, use_bleu_threshold=False):
    """Return True when *string_test* looks like a helpful model response.

    The response is flagged unhelpful when EITHER check fires:
      * max cosine similarity between its sentence embedding and the bank
        of known unhelpful phrases reaches 0.6, or
      * its sacreBLEU sentence score against that phrase bank reaches 40.

    NOTE(review): `use_bleu_threshold` is accepted but never read — the
    BLEU check always runs. The `df` filtering below is also dead code:
    the filtered frame is never used afterwards. Confirm intent before
    relying on either parameter.

    Usage:
        string_tests = ["15 given the equation",
                        "I'm sorry but I cannot summarize this text for you. ...",
                        "I'm sorry, I cannot answer"]
        for string_test in string_tests:
            is_helpful = test_check_unhelpful_stats(string_test, df, use_bleu_threshold=True)
            print(f"Is the response '{string_test}' helpful? {'Yes' if is_helpful else 'No'}")
    """
    unhelpful = get_unhelpful_list()
    sent_model = 'all-MiniLM-L6-v2'
    model = SentenceTransformer(sent_model)
    # Embeddings of the phrase bank are cached on disk in safetensors
    # format; encoding only happens on the first ever call.
    sentence_embeddings_file = 'sentence_embeddings.safetensors'
    if os.path.exists(sentence_embeddings_file):
        with safe_open(sentence_embeddings_file, framework="pt", device=0) as f:
            sentence_embeddings = f.get_tensor('sentence_embeddings')
    else:
        sentence_embeddings = model.encode(unhelpful)
        sentence_embeddings = torch.from_numpy(sentence_embeddings)
        save_file({'sentence_embeddings': sentence_embeddings}, sentence_embeddings_file)
    cosine_sim_threshold = 0.6
    bleu = BLEU(effective_order=True)
    bleu_threshold = 40
    if df is not None:
        # NOTE(review): dead code — `df` is filtered here but never read below.
        threshold = df['grade_deberta'].mean()
        df = df[df['grade_deberta'] > threshold]
    test_embedding = model.encode([string_test])
    test_embedding = torch.from_numpy(test_embedding)
    # Highest similarity of the test string against any known unhelpful phrase.
    max_sim = np.max(cosine_similarity(test_embedding.cpu().numpy(), sentence_embeddings.cpu().numpy()))
    is_unhelpful_cosine = max_sim >= cosine_sim_threshold
    bleu_score = bleu.sentence_score(string_test, unhelpful).score
    is_unhelpful_bleu = bleu_score >= bleu_threshold
    is_helpful = not (is_unhelpful_cosine or is_unhelpful_bleu)
    # Log diagnostics whenever a response is rejected.
    if not is_helpful: print(f"Max cosine similarity: {max_sim} > Method cosine: {is_unhelpful_cosine}, Method bleu: {is_unhelpful_bleu} < Bleu score: {bleu_score}:\nString test: {string_test}\n")
    return is_helpful
def remove_unhelpful_labels(file_path):
    """Rewrite *file_path* in place, dropping every (input, label) pair
    whose label is empty or judged unhelpful by test_check_unhelpful_stats."""
    from datasets import load_dataset
    graded = load_dataset('h2oai/openassistant_oasst1_h2ogpt_graded')
    df = pd.DataFrame(graded['train'])
    df['bot_response'] = df['input'].apply(get_bot_response)
    with open(file_path, 'r', encoding='utf-8') as fh:
        data = json.load(fh)
    # Walk backwards so deletions never shift indices still to be visited.
    for idx in tqdm(range(len(data['labels']) - 1, -1, -1)):
        cleaned = data['labels'][idx].strip()
        if not cleaned or not test_check_unhelpful_stats(cleaned, df, use_bleu_threshold=True):
            del data['input_ids'][idx]
            del data['labels'][idx]
    with open(file_path, 'w', encoding='utf-8') as fh:
        json.dump(data, fh, ensure_ascii=False, indent=4)
def remove_word_pair_label(file_path):  # simple clean filter
    """Rewrite *file_path* in place, dropping every (input, label) pair
    whose label contains the word "sorry" (case-insensitive)."""
    with open(file_path, 'r', encoding='utf-8') as fh:
        data = json.load(fh)
    # Iterate from the end so index-based deletion stays valid.
    for idx in tqdm(range(len(data['labels']) - 1, -1, -1)):
        cleaned = data['labels'][idx].strip()
        if "sorry" in cleaned.lower():
            del data['input_ids'][idx]
            del data['labels'][idx]
    with open(file_path, 'w', encoding='utf-8') as fh:
        json.dump(data, fh, ensure_ascii=False, indent=4)
# Summary Prompts for Generation completion dataset
import traceback
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline
import json
from tqdm import tqdm
# NOTE(review): these loads run at import time and move a local fine-tuned
# DistilBART checkpoint onto the GPU — a heavy module-level side effect.
# Consider lazy-loading inside summary_prompt() instead.
tokenizer = BartTokenizer.from_pretrained('distilbart-cnn-12-6-SD-prompt')
model = BartForConditionalGeneration.from_pretrained('distilbart-cnn-12-6-SD-prompt').to('cuda')
# HF summarization pipeline pinned to GPU 0; shared by summarize_text().
summarizer = pipeline('summarization', model=model, tokenizer=tokenizer, device=0)
def summarize_text(text):
    """Condense *text* to a 5-20 token summary using the module-level
    DistilBART summarization pipeline (deterministic, no sampling)."""
    result = summarizer(text, max_length=20, min_length=5, do_sample=False)
    return result[0]['summary_text']
def summary_prompt(file_path):
    """Fill every empty "input" field in the JSON list at *file_path* with
    a generated summary of that item's "output", then rewrite the file.
    RuntimeErrors from the summarizer are reported and skipped."""
    with open(file_path, 'r', encoding='utf-8') as fh:
        data = json.load(fh)
    for index, item in enumerate(tqdm(data, desc="Processing")):
        try:
            if not item["input"]:
                item["input"] = summarize_text(item["output"])
        except RuntimeError as e:
            # Keep going on per-item failures (e.g. CUDA OOM on one sample).
            print(f"Error at line {index+1}: {e}")
    with open(file_path, 'w', encoding='utf-8') as fh:
        json.dump(data, fh, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Pipeline entry point — uncomment the stage you want to run.
    #remove_word_pair_label('dataset_CLIP.json')
    #print_obj(1000, 'dataset_CLIP.json')
    #create_summary()
    #remove_unhelpful_labels('dataset_CLIP.json')
    summary_prompt('uniquePrompts.json')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement