import os
import re
import json
import random
from copy import deepcopy

import openai
import tiktoken
from llama_index import Document, PromptTemplate, ServiceContext
from llama_index.callbacks import CallbackManager
from llama_index.evaluation import DatasetGenerator
from llama_index.indices.list import SummaryIndex
from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser
from llama_index.text_splitter import SentenceSplitter
# Point the OpenAI client at a local OpenAI-compatible endpoint;
# the key is a placeholder since the local server does not validate it.
os.environ["OPENAI_API_KEY"] = "sk-111111111111111111111111111111111111111111111111"
os.environ["OPENAI_API_BASE"] = "http://localhost:5001/v1"
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]
def cleanup_text(input_text):
    # Remove square-bracketed spans and everything inside them, e.g. [1]
    input_text = re.sub(r'\[.*?\]', '', input_text)
    # Remove all lines starting with # (markdown headers)
    input_text = re.sub(r'^#.*', '', input_text, flags=re.MULTILINE)
    # Remove all asterisks (markdown emphasis)
    input_text = re.sub(r'\*', '', input_text)
    # Remove all empty lines
    input_text = re.sub(r'^\s*$', '', input_text, flags=re.MULTILINE)
    # Remove all commas
    input_text = input_text.replace(',', '')
    # Collapse runs of spaces into a single space
    input_text = re.sub(r' +', ' ', input_text)
    # Replace newline characters with spaces
    input_text = input_text.replace('\n', ' ')
    return input_text
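# Note: cleanup_text is defined but never invoked below. If it was meant to
# normalize the raw text before wrapping it in a Document (an assumption on
# our part), the call would look like:
#
#     doc_text = cleanup_text(doc_text)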
with open('output/Test.txt', 'r') as docs0:
    doc_text = docs0.read()
metadata = {"file_name": "Test.pdf"}
docs = [Document(text=doc_text, metadata=metadata)]
# print(docs[0].text)
callback_manager = CallbackManager([])
service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.9),
    callback_manager=callback_manager,
)
text_splitter = SentenceSplitter(
    separator="\n\n",
    chunk_size=512,
    chunk_overlap=0,
    paragraph_separator="\n\n\n",
    secondary_chunking_regex="[^,.;。]+[,.;。]?",
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode
)
node_parser = SimpleNodeParser.from_defaults(text_splitter=text_splitter)
nodes = node_parser.get_nodes_from_documents(docs)
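# Optional sanity check (not in the original flow): confirm how many chunks
# the splitter produced and roughly how large they are before spending API
# calls on question generation.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
print(f"Parsed {len(nodes)} nodes; "
      f"token counts of first chunks: {[len(enc.encode(n.text)) for n in nodes[:5]]}")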
num_questions_per_chunk = 20
question_gen_query = (
    "You are a Teacher/Professor. Your task is to set up "
    "a quiz/examination. Using the provided context, "
    f"formulate {num_questions_per_chunk} questions that each capture an "
    "important fact from the context.\n"
    "You MUST obey the following criteria:\n"
    "- Restrict the question to the context information provided.\n"
    "- Do NOT create a question that cannot be answered from the context.\n"
    "- Phrase the question so that it does NOT refer to specific context. "
    'For instance, do NOT put phrases like "given provided context" or "in this work" in the question, '
    "because if the question is asked elsewhere it wouldn't be provided specific context. Replace these terms "
    "with specific details.\n"
    "BAD questions:\n"
    "What did the author do in his childhood\n"
    "What were the main findings in this report\n\n"
    "GOOD questions:\n"
    "What did Barack Obama do in his childhood\n"
    "What were the main findings in the original Transformers paper by Vaswani et al.\n\n"
    "Generate the questions below:\n"
)
- fp = open("data/qa_pairs.jsonl", "w")
- for idx, node in enumerate(nodes):
- print (node.text)
- dataset_generator = DatasetGenerator(
- [node],
- question_gen_query=question_gen_query,
- service_context=service_context,
- metadata_mode="all",
- )
- node_questions_0 = dataset_generator.generate_questions_from_nodes(num=20)
- print(f"[Node {idx}] Generated questions:\n {node_questions_0}")
- # for each question, get a response
- for question in node_questions_0:
- index = SummaryIndex([node], service_context=service_context)
- query_engine = index.as_query_engine()
- response = query_engine.query(question)
- out_dict = {"query": question, "response": str(response)}
- print(f"[Node {idx}] Outputs: {out_dict}")
- fp.write(json.dumps(out_dict) + "\n")
- fp.close()
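# Optional: peek at the first generated record to verify the JSONL schema
# ({"query": ..., "response": ...}) before running the filtering pass.
with open("data/qa_pairs.jsonl", "r") as check_fp:
    first_line = check_fp.readline().strip()
    if first_line:
        print("Sample QA record:", json.loads(first_line))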
query_eval_tmpl = PromptTemplate(
    "Your task is to evaluate whether the response actually answers the query provided.\n"
    "If the response does not answer the question, answer NO.\n"
    "Otherwise answer YES.\n"
    "To elaborate, you might get an answer like the following: 'The context does not contain the answer to this question.' "
    "Please return NO in that case. "
    "You will be given the query and response. Return YES or NO as the answer.\n"
    "Query: \n {query_str}\n"
    "Response: \n {response_str}\n"
    "Answer: "
)
eval_llm = OpenAI(model="gpt-3.5-turbo")
def filter_data(path: str, out_path: str):
    # Keep only the QA pairs the evaluator judges as actually answered.
    with open(path, "r") as fp, open(out_path, "w") as out_fp:
        for idx, line in enumerate(fp):
            qa_pair = json.loads(line)
            eval_result = eval_llm.complete(
                query_eval_tmpl.format(
                    query_str=qa_pair["query"], response_str=qa_pair["response"]
                )
            )
            print(f"[{idx}] QA Pair: {qa_pair} \n Eval: {eval_result}")
            if "NO" in str(eval_result):
                continue
            out_fp.write(line)

filter_data("data/qa_pairs.jsonl", "data/qa_pairs_2.jsonl")
def split_train_val(path: str, out_train_path: str, out_val_path: str, train_split=0.7):
    with open(path, "r") as fp:
        lines = fp.readlines()
    # Shuffle the lines so the "train questions" cover most of the context.
    shuffled_lines = deepcopy(lines)
    random.shuffle(shuffled_lines)
    split_idx = int(train_split * len(shuffled_lines))
    train_lines = shuffled_lines[:split_idx]
    val_lines = shuffled_lines[split_idx:]
    with open(out_train_path, "w") as out_fp:
        out_fp.write("".join(train_lines))
    with open(out_val_path, "w") as out_fp:
        out_fp.write("".join(val_lines))

split_train_val(
    "data/qa_pairs_2.jsonl", "data/qa_pairs_train.jsonl", "data/qa_pairs_val.jsonl"
)
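# Optional sanity check: confirm the ~70/30 split landed as expected.
with open("data/qa_pairs_train.jsonl") as f_train, open("data/qa_pairs_val.jsonl") as f_val:
    print(f"train: {len(f_train.readlines())} pairs, val: {len(f_val.readlines())} pairs")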
- vp = open("data/qa_pairs_val.jsonl", "r")
- fp = open("data/qa_pairs_train.jsonl", "r")
- out_fp = open("data/qa_pairs_mistral.jsonl", "w")
- out_vp = open("data/qa_pairs_mistral_val.jsonl", "w")
- for line in fp:
- qa_pair = json.loads(line)
- out_dict = {
- "input": qa_pair["query"],
- "output": qa_pair["response"]
- }
- out_fp.write(json.dumps(out_dict) + "\n")
- for line in vp:
- vp_pair = json.loads(line)
- out_dict_vp = {
- "input": vp_pair["query"],
- "output": vp_pair["response"]
- }
- out_vp.write(json.dumps(out_dict_vp) + "\n")
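# Optional: validate that every converted line is well-formed JSON with the
# expected {"input", "output"} keys. Whether your fine-tuning stack (the
# filenames suggest Mistral) expects exactly this schema is an assumption;
# adjust the keys to match your trainer's format if needed.
with open("data/qa_pairs_mistral.jsonl") as f:
    for line in f:
        record = json.loads(line)  # raises on malformed JSON
        assert set(record) == {"input", "output"}
print("qa_pairs_mistral.jsonl validated")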