Guest User

clean_dataset.py

a guest
Jun 2nd, 2024
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 13.24 KB | Source Code | 0 0
  1. ## (optional) Approximated dedup using sklearn
  2. from sklearn.feature_extraction.text import TfidfVectorizer
  3. from sklearn.metrics.pairwise import cosine_similarity
  4. import pandas as pd
  5. import numpy as np
  6.  
  7. df = pd.read_json('input.json', encoding='utf-8')
  8. df = df[['instruction', 'input', 'output']]
  9. vectorizer = TfidfVectorizer().fit_transform(df['instruction'])
  10. cosine_sim = cosine_similarity(vectorizer)
  11. threshold = 0.8
  12. to_remove = []
  13.  
  14. for i in range(cosine_sim.shape[0]):
  15.     for j in range(i+1, cosine_sim.shape[1]):
  16.         if cosine_sim[i, j] > threshold:
  17.             to_remove.append(j)
  18.  
  19. df_deduplicated = df.drop(df.index[to_remove]) # Remove the marked questions
  20. df_deduplicated.to_json('input_sklearn.json', force_ascii=False, indent=4, orient='records')
  21.  
  22.  
  23.  
  24.  
  25. ## fix grammar using 'wisdominanutshell/coedit-xxl-8bit' ≈20GB VRAM
  26. import json
  27. from transformers import AutoTokenizer, T5ForConditionalGeneration, BitsAndBytesConfig
  28. import torch
  29. from tqdm import tqdm
  30.  
  31. tokenizer = AutoTokenizer.from_pretrained("wisdominanutshell/coedit-xxl-8bit")
  32. model = T5ForConditionalGeneration.from_pretrained("wisdominanutshell/coedit-xxl-8bit", quantization_config=BitsAndBytesConfig(load_in_8bit=True))
  33. device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  34.  
  35. with open('input_sklearn.json', 'r', encoding='utf-8') as f:
  36.     data = json.load(f)
  37.  
  38. for sample in tqdm(data, desc="Processing data"):
  39.     for field in ['instruction', 'input', 'output']:
  40.         input_text = f'Fix grammatical errors in this sentence: {sample[field]}'
  41.         input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
  42.         outputs = model.generate(input_ids, max_length=256)
  43.         edited_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  44.         sample[field] = edited_text
  45.  
  46. with open('input_coedit.json', 'w', encoding='utf-8') as f:
  47.     json.dump(data, f, ensure_ascii=False, indent=4)
  48.  
  49.  
  50.  
  51.  
  52.  
  53. ## rmv unwanted Field, include "Football", "Rugby" for answer base on of [RESPONSE SAMPLE]
  54.  
  55. import json
  56. import ijson
  57. from tqdm import tqdm
  58. import codecs
  59. import re
  60.  
  61. def remove_unwanted_output_objects(input_filename, output_filename, unwanted_words):
  62.     refusals = [re.compile(re.escape(word), re.IGNORECASE) for word in unwanted_words]
  63.     def contains_unwanted_words(text):
  64.         return any(refusal.search(text) for refusal in refusals)
  65.     with codecs.open(input_filename, 'r', encoding='utf-8-sig') as input_file, open(output_filename, 'w', encoding='utf-8') as output_file:
  66.         objects = ijson.items(input_file, 'item')
  67.         output_file.write('[\n')
  68.         first = True
  69.         for obj in tqdm(objects, desc="Processing"):
  70.             if not contains_unwanted_words(obj['instruction']) and not contains_unwanted_words(obj['input']) and not contains_unwanted_words(obj['output']):
  71.                 if not first:
  72.                     output_file.write(',\n')
  73.                 json.dump(obj, output_file, ensure_ascii=False, indent=4)
  74.                 first = False
  75.         output_file.write('\n]')
  76.  
  77. unwanted_words = ["Football", "Rugby", "not be appropriate", "supremacist", "extremist", "responsible AI", "AI principles", "AI assistant", "an AI language", "ableist", "hurtful", "gender stereotype", "gender inequality", "underrepresentation", "safe spaces", "gender-based", "inclusivity", "feminist", "feminism", "transgender", "empowerment", "communist", "capitalism", "stereotypes", "biases", "bias", "Microaggression", "prioritize human safety", "as a language model", "as an AI language model", "As a large language model", "As an AI", "ethical principles", "consensual", "it is not appropriate", "it's not appropriate", "I cannot fulfill your request", "harmful to human beings", "ethical guidelines", "my guidelines", "prioritize user safety", "adhere to ethical guidelines", "harmful consequences", "potentially harmful", "dangerous activities", "promote safety", "well-being of all users", "responsible information sharing", "jeopardize the safety", "illegal actions or intentions", "undermine the stability", "promote the well-being", "illegal activities or actions", "adherence to the law", "potentially be harmful", "illegal substances or activities", "committed to promoting", "safe information", "lawful information", "cannot provide guidance", "cannot provide information", "unable to offer assistance", "cannot engage in discussions", "programming prohibits", "follow ethical guidelines", "ensure the safety", "involves an illegal subject", "prioritize safety", "illegal subject", "prioritize user well-being", "cannot support or promote", "activities that could harm", "pose a risk to others", "against my programming", "activities that could undermine", "potentially dangerous", "not within the scope", "designed to prioritize safety", "not able to provide", "maintain user safety", "adhere to safety guidelines", "dangerous or harmful", "cannot provide any information", "focus on promoting safety", "an AI language model you don't have", "As an AI language model, I 
cannot", "As an AI language model, I do not", "As an AI language model, I am not able", "As an AI language model, I don't have personal", "I am an AI language model and do not", "However,it is important to use any code or information provided responsibly and within legal and ethical boundaries.", "As an AI language model, I don't have", "As an AI language model, I am only able", "AI language model and I do not", "As an AI language model, I cannot modify", "I know as an AI language model you don't have", "as an AI language model, you cannot", "I'm sorry, but as an AI language model", "Unfortunately, I cannot provide", "I'm sorry, I cannot", "I'm sorry, I cannot generate", "AI cannot create or program", "I'm afraid I cannot create", "you cannot create an", "it operates ethically and is", "had an ethical system", "Ensuring the ethical", "and ethical sourcing", "are from ethical", "legal and ethical", "engage in unethical", "unethical or aggressive", "unethical business", "como modelo de lenguaje AI", "Lo siento, como modelo de lenguaje", "no puedo proporcionar", "pero debido a mi capacidad para generar c digos complejos y completos es limitado", "Lo siento, pero no puedo", "Lo siento, pero como modelo de lenguaje, no puedo proporcionar", "Lo siento, como modelo de lenguaje, no tengo", "Lo siento, debe haber habido una confusi n", "Lo siento, como modelo de lenguaje, no puedo realizar", "Lo siento, soy un modelo de lenguaje y no tengo la capacidad de generar", "Lamento no poder proporcionarte el c digo", "Desculpe-me, mas a linguagem vulgar e ofensiva", "apropriada em nenhum contexto", "Como modelo de linguagem", "Como um modelo de linguagem, n o tenho a capacidade de", "I cannot assist", "prioritize ethical", "respectful", "morally", "I'm sorry,", "I'm an", "I am an", "I'm an AI", "I am an AI", "my purpose", "filter_bad_language", "entertainment purposes", "purely hypothetical", "not a human", "cannot provide", "can't provide", "won't provide", "not provide", "worth 
noting", "cause harm", "a language model", "keep in mind", "unethical", "bad language", "the words ****", "bad_language", "certainly not", "complying", "comply", "I cannot", "my main goal", "As a machine", "I don't have the ability", "I am here to assist", "my purpose is to ", "my knowledge cutoff", "my knowledge cut off", "September 2021", "regulations", "not be suitable", "I apologize, but", "It is not possible", "controversial", "my programming", "ethically", "it is important to", "Please note", "sensitive topic", "not acceptable", "It is important for", "divisive", "not appropriate", "our values", "f*cking", "F*ck", "sh*t", "diversity and", "diversity and inclusion", "values diversity", "social responsibility", "environmental, social, and governance", " ESG ", "against women", "problematic history", "diversity", "*This chat conversation is shared from", "*This conversation is shared from", "I can t give a response", "(against|violate) my programming", "openai", "language model", "large language", "as an? (ai|generative language|gpt|bot)", "important to note", "i do(n't| not) (possess|have|exhibit) (personal|consciousness|subjective)", "personal (feelings|thoughts|emotions|desires|experiences|goals|objective|belief)", "(can('t| ?not)|w(on't|will not)|unable.?) (\\w+\\s)+(with (that|your)|your \\w+|provide)", "my limitations", "the limitations of my", "my abilities", "violates my", "i (can('t| ?not)|w(on't|will not)|am (not |un)able.?).{0,30}(you are|you're|your )", "please note that", "unethical|illegal|dangerous", "a text-based", "(engag(e|ing)|participat(e|ing)|be involved (in|with)|promot(e|ing)|discuss(ing)?|provid(e|ing))( in)?(\\s*\\w+ that)?(\\s+potentially)? 
(derogatory|inappropriate|offensive|discriminate|discriminatory|sexist|unacceptable|immoral|unethical|unacceptable|hateful|harmful)", "i am commited to", "adhere to safety guidelines", "maintain user safety", "about something else instead", "as an AI language model", "September 2021", "I don't have personal beliefs", "unable to assist", "cannot assist", "can't assist", "not able to assist", "unable to help", "cannot help", "not able to help", "not programmed to assist", "solely to process and generate text", "outside of my capabilities", "don't have the ability", "don't have the capacity to help", "don't have the capacity to understand", "do not have the capacity to understand", "do not have enough information about", "don't have the necessary information", "do not have the necessary information"]
  78.  
  79. remove_unwanted_output_objects('input_coedit.json', 'input_unwanted_words.json', unwanted_words)
  80.  
  81.  
  82.  
  83.  
  84.  
  85. ## deduplicated using pandas_dedupe (AI model dedup from feedback)
  86.  
  87. import json
  88. import pandas as pd
  89. import pandas_dedupe
  90.  
  91. def deduplicate(df):
  92.     df_deduped = pandas_dedupe.dedupe_dataframe(df, ['instruction', 'input', 'output'], canonicalize=True, config_name="dedupe_dataframe") # Resume arg: , update_model=True
  93.     return df_deduped
  94.  
  95. def process_data(input_file, output_file):
  96.     with open(input_file, 'r') as f:
  97.         data = json.load(f)
  98.     df = pd.DataFrame(data)
  99.     df = deduplicate(df)
  100.     df = df[['instruction', 'input', 'output']]
  101.     data = df.to_dict('records')
  102.     with open(output_file, 'w', encoding='utf-8') as f:
  103.         json.dump(data, f, ensure_ascii=False, indent=4)
  104.     print("Remaining Q&A pairs after deduplication:")
  105.     print(df)
  106.  
  107. process_data('input_unwanted_words.json', 'input_pandas_dedupe.json')
  108.  
  109.  
  110.  
  111.  
  112.  
## Remove already-known QA pairs (using the OpenAI-compatible API from oobabooga/text-generation-webui)
  114.  
  115. import os
  116. os.environ['OPENAI_API_KEY'] = 'sk-111111111111111111111111111111111111111111111111' # set OPENAI_API_KEY=sk-111111111111111111111111111111111111111111111111
  117. os.environ['OPENAI_API_BASE'] = 'http://127.0.0.1:5000/v1' # set OPENAI_API_BASE=http://127.0.0.1:5000/v1   # or ollama:  http://localhost:11434/v1
  118. os.environ['OPENEDAI_EMBEDDING_MODEL'] = 'text-embedding-ada-002' # set OPENEDAI_EMBEDDING_MODEL='text-embedding-ada-002'
  119.  
  120. from tqdm import tqdm
  121. import os, openai
  122. from dotenv import load_dotenv
  123. load_dotenv()
  124.  
  125. api_key = os.getenv("OPENAI_API_KEY")
  126. base_url = os.getenv("OPENAI_API_BASE")
  127.  
  128. openai.api_key = os.environ["OPENAI_API_KEY"]
  129. openai.api_base = os.environ["OPENAI_API_BASE"]
  130. openai.embedding_model = os.environ["OPENEDAI_EMBEDDING_MODEL"]
  131.  
  132. import requests, json
  133. from termcolor import colored
  134.  
  135. def SendMessage(prompt, model="tinydolphin:1.1b-v2.8-q2_K", do_sample=False, top_k=1):
  136.     endpoint = '/chat/completions'
  137.     url = base_url + endpoint
  138.     headers = {
  139.         'Content-Type': 'application/json',
  140.         'Authorization': f'Bearer {api_key}'
  141.     }
  142.     messages = [
  143.         {"role": "user", "content": prompt}
  144.     ]
  145.     data = {
  146.         'model': model,
  147.         'messages': messages,
  148.         'max_tokens': 2000,
  149.         'do_sample': do_sample, # deterministic
  150.         'top_k': top_k # deterministic
  151.     }  
  152.  
  153.     response = requests.post(url, headers=headers, data=json.dumps(data))
  154.     result = response.json()['choices'][0]['message']['content']
  155.     print(colored(prompt, 'blue')) # Q
  156.     print(colored(result, 'green')) # A
  157.     return result
  158.  
  159.  
  160. def format_query_QA_prompt(instruction, input, output):
  161.     prompt = f"""
  162.    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
  163.  
  164.    ### Instruction:
  165.    Answer by "Yes" or "No", Did you already know the "output" answer information or fact?
  166.    
  167.    ### Input:
  168.    [
  169.        {{
  170.            "instruction": {instruction},
  171.            "input": {input},
  172.            "output": {output}
  173.        }}
  174.    ]
  175.    
  176.    ### Response:
  177.    """
  178.     return prompt
  179.  
  180.  
  181. def CureKnowQA(input_file, output_file):
  182.     with open(input_file, 'r', encoding='utf-8') as f:
  183.         data = json.load(f)
  184.     corrected_data = []
  185.     for obj in data:
  186.         instruction = obj['instruction']
  187.         input = obj['input']
  188.         output = obj['output']
  189.         prompt = format_query_QA_prompt(instruction, input, output)
  190.         result = SendMessage(prompt)
  191.  
  192.         # If the result contains "Yes", skip this object
  193.         if "Yes" in result:
  194.             continue
  195.  
  196.         corrected_data.append(obj)
  197.     with open(output_file, 'w', encoding='utf-8') as f:
  198.         json.dump(corrected_data, f, ensure_ascii=False, indent=4)
  199.  
  200. CureKnowQA('input_pandas_dedupe.json','output_cured.json')
Advertisement
Add Comment
Please, Sign In to add comment