cleanDatasetNPair.py

import os
import json
import time

import torch
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from sacrebleu.metrics import BLEU
from safetensors import safe_open
from safetensors.torch import save_file

from g4f.client import Client

client = Client()

def completionsAPI(text):
    """Query a g4f provider and return the completion text."""
    response = client.chat.completions.create(
        model="dolphin-mixtral-8x7b",  # alternatives: claude_v2, gemini-pro, gpt-4-turbo, gpt-4-32k, pi
        messages=[{"role": "user", "content": text}]
    )
    return response.choices[0].message.content

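# Illustrative sketch, not part of the original script: free g4f providers fail
# intermittently, so a thin retry wrapper around completionsAPI() can help when
# create_summary() runs for a long time. Retry count and delay are arbitrary assumptions.
def completionsAPI_with_retry(text, retries=3, delay=5):
    for attempt in range(retries):
        try:
            return completionsAPI(text)
        except Exception as e:  # g4f raises provider-specific errors; catch broadly here
            print(f"completionsAPI attempt {attempt + 1}/{retries} failed: {e}")
            time.sleep(delay)
    raise RuntimeError("completionsAPI failed after all retries")
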
def load_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_dataset(file_path, dataset):
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)

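# Optional hardening sketch (an assumption, not in the original): create_summary()
# rewrites the JSON file after every item, so a crash mid-write can corrupt it.
# Writing to a temporary file and renaming it into place keeps the on-disk copy valid.
def save_dataset_atomic(file_path, dataset):
    tmp_path = file_path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)
    os.replace(tmp_path, file_path)  # atomic replace on the same filesystem
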
def completions(text):  # local text-generation-webui endpoint
    response = requests.post(
        "http://127.0.0.1:5000/v1/completions",
        headers={"Content-Type": "application/json"},
        json={"prompt": text, "max_tokens": 200, "temperature": 1, "top_p": 0.9, "seed": 10},
    )
    return response.json()['choices'][0]['text']

def print_obj(index, file_path):  # debug helper: print one input/label pair
    with open(file_path, 'r', encoding='utf-8') as f:
        d = json.load(f)
    print(d['input_ids'][index] + ' |||| ' + d['labels'][index])

def create_summary():
    file_path = 'dataset_CLIP.json'
    before_text = 'You are an expert summary writer with many years of experience. Construct efficient, well-structured, objectively optimized summaries of TEXT, distilling the core message while omitting unnecessary, obvious, or irrelevant details with precision and brevity. Emphasize the essential ideas while dropping non-critical descriptors. Concisely capture meaningful tags, entities, and core arguments. Let\'s think through this rationally, step by step. In the example below, enhancement words such as "best quality, masterpiece, detailed, detailed face" are not useful; "windy, floating hair, snowy" all relate to winter and are not worth keeping; "trees" is only a background element. Keep the output JSON-valid and use very few words. Example:\ntext = "best quality, masterpiece, White hair,detailed, red eyes, windy, floating hair, snowy, upper body, detailed face, winter, trees, sunshine"\nsummary = "White hair, red eyes, winter in sunshine"\n\nPlease summarize the following: text ="'
    after_text = '"\nsummary = "'

    dataset = load_dataset(file_path)
    # Resume at the first empty label; if none is empty, everything is already done.
    start_index = next((i for i, x in enumerate(dataset['labels']) if x == ""), len(dataset['labels']))
    for i in tqdm(range(start_index, len(dataset['input_ids'])), desc="Processing dataset"):
        text = before_text + dataset['input_ids'][i] + after_text
        dataset['labels'][i] = completionsAPI(text)  # or completions() for the local endpoint - fills empty labels
        save_dataset(file_path, dataset)
        time.sleep(3)

def get_unhelpful_list():  # https://github.com/h2oai/h2ogpt/blob/main/src/create_data.py
    unhelpful = ["I'm sorry, I didn't quite understand your question, could you please rephrase it?",
                 "I'm sorry, but I don't understand your question. Could you please rephrase it?",
                 "I'm sorry, I don't quite understand your question",
                 "I'm sorry, I don't know",
                 "I'm sorry, but I don't know",
                 "I don't know anything",
                 "I do not know",
                 "I don't know",
                 "I don't know how",
                 "I do not know how",
                 "Can you please explain what you mean",
                 "please explain what you mean",
                 "please explain",
                 "I'm sorry, but I don't know how to tell a story. Can you please explain what you mean by",
                 "I'm sorry but I don't understand what you mean",
                 "I don't understand",
                 "I don't have the ability",
                 "I do not have the ability",
                 "I do not have",
                 "I am a language model,",
                 "I am a large language model,",
                 "I do not understand your question. Can you please try to make it clearer?",
                 "I'm sorry, but as an AI language model",
                 "I apologize, but I cannot rephrase text that I cannot understand. Your post is difficult to read and follow.",
                 "I apologize, but I am not h2oGPT. I am a language model developed by H2O.ai. How may I help you?",
                 "Sorry, but I am not an actual Linux shell, nor am I capable of emulating one. I am an open source chat assistant and would be glad t",
                 "I apologize, but I cannot perform the task you have requested.",
                 "I'm sorry, I cannot perform this task as I am an AI language model and do not have access",
                 "I'm sorry, I'm not sure what you're asking for here.",
                 "I'm not sure what you are asking",
                 "You need to provide more context",
                 ]
    # reduced versions, with redundant parts, just to give context for where they came from
    unhelpful += ["sorry, I didn't quite understand your question",
                  "I didn't quite understand your question",
                  "I didn't understand your question",
                  "I did not understand your question",
                  "I did not understand the question",
                  "could you please rephrase",
                  "could you rephrase",
                  "I do not understand your question.",
                  "I do not understand the question.",
                  "I do not understand that question.",
                  "Can you please try to make it clearer",
                  "Can you try to make it clearer",
                  "sorry, but as an AI language model",
                  "as an AI language model",
                  "I apologize, but I cannot",
                  "I cannot rephrase text",
                  "I cannot understand. Your post is difficult to read and follow.",
                  "Your post is difficult to read and follow.",
                  "I apologize, but I am",
                  "Sorry, but I am not ",
                  "nor am I capable",
                  "I am not capable of",
                  "I apologize, but I cannot perform the task you have requested",
                  "I cannot perform the task",
                  "I cannot complete the task",
                  "I'm sorry",
                  "I am sorry",
                  "do not have access",
                  "not sure what you're asking for",
                  "not sure what you are asking for",
                  "not sure what is being asked",
                  "I'm not sure what you are asking",
                  "not sure what you are asking",
                  "You need to provide more context",
                  "provide more context",
                  ]
    unhelpful += ["As a large language model",
                  "cannot provide any information",
                  "As an artificial intelligence I do not have the capability",
                  "As an artificial intelligence I don't have the capability",
                  "As an artificial intelligence I can't",
                  "As an artificial intelligence I cannot",
                  "I am sorry but I do not understand",
                  "Can you please explain",
                  "(sorry couldn't resist)",
                  "(sorry could not resist)",
                  " :)",
                  " ;)",
                  " :-)",
                  " ;-)",
                  " lol ",
                  "Thanks so much!!!",
                  "Thank You :)!!!",
                  "Please try not to repeat",
                  "I am an AI language model",
                  "I'm a AI assistant that",
                  "I'm an AI assistant that",
                  "I am an AI assistant that",
                  "etc.",
                  "etc.etc.",
                  "etc. etc.",
                  "etc etc",
                  "I'm sorry but I cannot",
                  ]
    return unhelpful

def get_bot_response(input_string):
    """Extract the bot's reply from a "<human>:"/"<bot>:" formatted transcript."""
    bot_response = input_string.split("<bot>:")[-1].split("<human>:")[0].strip()
    return bot_response

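# Quick illustrative check (not in the original) of how get_bot_response() slices a
# turn-formatted string: it keeps the text after the last "<bot>:" marker, up to the
# next "<human>:" turn if one follows.
# >>> get_bot_response("<human>: hi <bot>: hello there <human>: thanks")
# 'hello there'
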
def test_check_unhelpful_stats(string_test, df=None, use_bleu_threshold=False):
    """
    Return True if string_test looks like a helpful response, False if it matches
    known unhelpful/refusal patterns by cosine similarity (and optionally by BLEU).

    Usage:
    string_tests = ["15 given the equation",
                    "I'm sorry but I cannot summarize this text for you. It contains explicit and inappropriate content that violates my terms of use. Please refrain from sending such texts in the future. Thank you for your understanding.🙏",
                    "I'm sorry, I cannot answer"]
    for string_test in string_tests:
        is_helpful = test_check_unhelpful_stats(string_test, df, use_bleu_threshold=True)
        print(f"Is the response '{string_test}' helpful? {'Yes' if is_helpful else 'No'}")
    """
    unhelpful = get_unhelpful_list()
    sent_model = 'all-MiniLM-L6-v2'
    model = SentenceTransformer(sent_model)
    # Cache the embeddings of the unhelpful phrases so they are encoded only once.
    sentence_embeddings_file = 'sentence_embeddings.safetensors'
    if os.path.exists(sentence_embeddings_file):
        with safe_open(sentence_embeddings_file, framework="pt", device="cpu") as f:  # loaded on CPU; compared on CPU below
            sentence_embeddings = f.get_tensor('sentence_embeddings')
    else:
        sentence_embeddings = model.encode(unhelpful)
        sentence_embeddings = torch.from_numpy(sentence_embeddings)
        save_file({'sentence_embeddings': sentence_embeddings}, sentence_embeddings_file)
    cosine_sim_threshold = 0.6
    bleu = BLEU(effective_order=True)
    bleu_threshold = 40
    if df is not None:
        # Keep only above-average graded rows; note the filtered df is not used further below.
        threshold = df['grade_deberta'].mean()
        df = df[df['grade_deberta'] > threshold]
    test_embedding = model.encode([string_test])
    test_embedding = torch.from_numpy(test_embedding)
    max_sim = np.max(cosine_similarity(test_embedding.cpu().numpy(), sentence_embeddings.cpu().numpy()))
    is_unhelpful_cosine = max_sim >= cosine_sim_threshold
    # Only apply the BLEU check when requested (the original computed it unconditionally).
    bleu_score = bleu.sentence_score(string_test, unhelpful).score if use_bleu_threshold else 0.0
    is_unhelpful_bleu = use_bleu_threshold and bleu_score >= bleu_threshold
    is_helpful = not (is_unhelpful_cosine or is_unhelpful_bleu)
    if not is_helpful:
        print(f"Max cosine similarity: {max_sim} > Method cosine: {is_unhelpful_cosine}, "
              f"Method bleu: {is_unhelpful_bleu} < Bleu score: {bleu_score}:\nString test: {string_test}\n")
    return is_helpful

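# Optional optimization sketch (an assumption, not in the original):
# test_check_unhelpful_stats() reloads the SentenceTransformer on every call, which is
# slow inside remove_unhelpful_labels()'s loop. A module-level cache loads it once.
_sent_model_cache = {}

def get_sent_model(name='all-MiniLM-L6-v2'):
    if name not in _sent_model_cache:
        _sent_model_cache[name] = SentenceTransformer(name)
    return _sent_model_cache[name]
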
def remove_unhelpful_labels(file_path):
    from datasets import load_dataset  # shadows the local load_dataset() helper inside this scope
    dataset = load_dataset('h2oai/openassistant_oasst1_h2ogpt_graded')
    df = pd.DataFrame(dataset['train'])
    df['bot_response'] = df['input'].apply(get_bot_response)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Iterate backwards so deletions do not shift the remaining indices.
    for i in tqdm(range(len(data['labels']) - 1, -1, -1)):
        label = data['labels'][i].strip()  # remove leading and trailing spaces
        if not label or not test_check_unhelpful_stats(label, df, use_bleu_threshold=True):  # empty or unhelpful
            del data['input_ids'][i]
            del data['labels'][i]
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

def remove_word_pair_label(file_path):  # simple keyword cleaning filter
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Iterate backwards so deletions do not shift the remaining indices.
    for i in tqdm(range(len(data['labels']) - 1, -1, -1)):
        label = data['labels'][i].strip()  # remove leading and trailing spaces
        if "sorry" in label.lower() or "cannot" in label.lower():  # drop labels containing "sorry"/"cannot" (case-insensitive)
            del data['input_ids'][i]
            del data['labels'][i]
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

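# Generalization sketch (illustrative; the word list below is an assumption): the same
# keyword filter with a configurable set of refusal markers instead of hard-coded words.
BANNED_WORDS = ("sorry", "cannot", "as an ai")

def label_is_banned(label):
    low = label.lower()
    return any(word in low for word in BANNED_WORDS)
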
# Summary prompts for generating the completion dataset
import traceback
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline

tokenizer = BartTokenizer.from_pretrained('distilbart-cnn-12-6-SD-prompt')  # presumably a local fine-tuned checkpoint directory
model = BartForConditionalGeneration.from_pretrained('distilbart-cnn-12-6-SD-prompt').to('cuda')
summarizer = pipeline('summarization', model=model, tokenizer=tokenizer, device=0)

def summarize_text(text):
    summary = summarizer(text, max_length=20, min_length=5, do_sample=False)
    return summary[0]['summary_text']

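# Throughput sketch (illustrative): transformers pipelines also accept a list of inputs
# plus a batch_size argument, so empty "input" fields could be filled in batches rather
# than one call per item. batch_size=8 is an arbitrary assumption.
def summarize_texts(texts, batch_size=8):
    summaries = summarizer(texts, max_length=20, min_length=5,
                           do_sample=False, batch_size=batch_size)
    return [s['summary_text'] for s in summaries]
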
def summary_prompt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    for index, item in enumerate(tqdm(data, desc="Processing")):
        try:
            if not item["input"]:
                item["input"] = summarize_text(item["output"])
        except RuntimeError as e:
            print(f"Error at item {index + 1}: {e}")
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    #remove_word_pair_label('dataset_CLIP.json')
    #print_obj(1000, 'dataset_CLIP.json')
    #create_summary()
    #remove_unhelpful_labels('dataset_CLIP.json')
    summary_prompt('uniquePrompts.json')