llamaindex QA generate

import os
import re
from pathlib import Path
import openai
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from llama_index import VectorStoreIndex
from pathlib import Path
from llama_hub.file.pdf.base import PDFReader
from llama_hub.file.unstructured.base import UnstructuredReader
from llama_hub.file.pymu_pdf.base import PyMuPDFReader
from llama_index import Document
from llama_index.callbacks import CallbackManager
from llama_index.evaluation import DatasetGenerator
from llama_index.indices.list import SummaryIndex
from llama_index.node_parser import SimpleNodeParser
# try evaluation modules
from llama_index.evaluation import RelevancyEvaluator, FaithfulnessEvaluator
from llama_index import PromptTemplate
import json
from copy import deepcopy
import random
from llama_index import ServiceContext
from llama_index.llms import ChatMessage
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredMarkdownLoader, DirectoryLoader
import tiktoken
from llama_index.text_splitter import SentenceSplitter

os.environ["OPENAI_API_KEY"] = "sk-111111111111111111111111111111111111111111111111"
os.environ["OPENAI_API_BASE"] = "http://localhost:5001/v1"
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]


docs = []
cleaned_text = []
doc_text_cleaned = []
docs_raw = []

def cleanup_text(input_text):
    # Remove all double brackets and characters within
    input_text = re.sub(r'\[.*?\]', '', input_text)
    # Remove all lines starting with #
    input_text = re.sub(r'^#.*', '', input_text, flags=re.MULTILINE)
    # Remove all asterisks
    input_text = re.sub(r'\*', '', input_text)
    # Remove all empty lines
    input_text = re.sub(r'^\s*$', '', input_text, flags=re.MULTILINE)
    # Remove all commas
    input_text = input_text.replace(',', '')
    # Remove all double spaces
    input_text = re.sub(r' +', ' ', input_text)
    # Replace newline characters with spaces
    input_text = input_text.replace('\n', ' ')
    return input_text


with open('output/Test.txt', 'r') as docs0:
    doc_text = docs0.read()

metadata = {"file_name": "Test.pdf"}
docs = [Document(text=doc_text, metadata=metadata)]

#print(docs[0].page_content())

callback_manager = CallbackManager([])

model = OpenAI(model="gpt-3.5-turbo", temperature=0.9),

service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.9),
    callback_manager=callback_manager,
)

service_context_large = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.9),
    callback_manager=callback_manager,
)

text_splitter = SentenceSplitter(
  separator="\n\n",
  chunk_size=512,
  chunk_overlap=0,
  paragraph_separator="\n\n\n",
  secondary_chunking_regex="[^,.;。]+[,.;。]?",
  tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode
)

node_parser = SimpleNodeParser.from_defaults(text_splitter=text_splitter)
nodes = node_parser.get_nodes_from_documents(docs)

num_questions_per_chunk = 20
question_gen_query = (
    "You are a Teacher/ Professor. Your task is to setup "
    "a quiz/examination. Using the provided context, "
    f"formulate {num_questions_per_chunk} that captures an important fact from the "
    "context. \n"
    "You MUST obey the following criteria:\n"
    "- Restrict the question to the context information provided.\n"
    "- Do NOT create a question that cannot be answered from the context.\n"
    "- Phrase the question so that it does NOT refer to specific context. "
    'For instance, do NOT put phrases like "given provided context" or "in this work" in the question, '
    "because if the question is asked elsewhere it wouldn't be provided specific context. Replace these terms "
    "with specific details.\n"
    "BAD questions:\n"
    "What did the author do in his childhood\n"
    "What were the main findings in this report\n\n"
    "GOOD questions:\n"
    "What did Barack Obama do in his childhood\n"
    "What were the main findings in the original Transformers paper by Vaswani et al.\n\n"
    "Generate the questions below:\n"
)

fp = open("data/qa_pairs.jsonl", "w")
for idx, node in enumerate(nodes):
    print (node.text)

    dataset_generator = DatasetGenerator(
        [node],
        question_gen_query=question_gen_query,
        service_context=service_context,
        metadata_mode="all",
    )

    node_questions_0 = dataset_generator.generate_questions_from_nodes(num=20)
    print(f"[Node {idx}] Generated questions:\n {node_questions_0}")
    # for each question, get a response
    for question in node_questions_0:
        index = SummaryIndex([node], service_context=service_context)
        query_engine = index.as_query_engine()
        response = query_engine.query(question)
        out_dict = {"query": question, "response": str(response)}
        print(f"[Node {idx}] Outputs: {out_dict}")
        fp.write(json.dumps(out_dict) + "\n")

fp.close()

query_eval_tmpl = PromptTemplate(
    "Your task is to evaluate the following: If the response for the query isn't able to answer the question provided.\n"
    "If query isn't able to answer the question, answer NO.\n"
    "Otherwise answer YES.\n"
    "To elaborate, you might get an answer like the following: 'The context does not contain the answer to this question.'"
    "Please return NO in that case. "
    "You be given the query and response. Return YES or NO as the answer.\n"
    "Query: \n {query_str}\n"
    "Response: \n {response_str}\n"
    "Answer: "
)

eval_llm = OpenAI(model="gpt-3.5-turbo")

def filter_data(path: str, out_path: str):
    fp = open(path, "r")
    out_fp = open(out_path, "w")
    new_lines = []
    for idx, line in enumerate(fp):
        qa_pair = json.loads(line)
        eval = eval_llm.complete(
            query_eval_tmpl.format(
                query_str=qa_pair["query"], response_str=qa_pair["response"]
            )
        )

        print(f"[{idx}] QA Pair: {qa_pair} \n Eval: {eval}")
        if "NO" in str(eval):
            continue
        else:
            # new_lines.append(line)
            out_fp.write(line)

filter_data("data/qa_pairs.jsonl", "data/qa_pairs_2.jsonl")

def split_train_val(path: str, out_train_path: str, out_val_path: str, train_split=0.7):
    with open(path, "r") as fp:
        lines = fp.readlines()

        # shuffle the lines to make sure that the "train questions" cover most fo the context
        shuffled_lines = deepcopy(lines)
        random.shuffle(shuffled_lines)

        split_idx = int(train_split * len(shuffled_lines))
        train_lines = shuffled_lines[:split_idx]
        val_lines = shuffled_lines[split_idx:]
        with open(out_train_path, "w") as out_fp:
            out_fp.write("".join(train_lines))

        with open(out_val_path, "w") as out_fp:
            out_fp.write("".join(val_lines))

split_train_val(
    "data/qa_pairs_2.jsonl", "data/qa_pairs_train.jsonl", "data/qa_pairs_val.jsonl"
)

vp = open("data/qa_pairs_val.jsonl", "r")
fp = open("data/qa_pairs_train.jsonl", "r")
out_fp = open("data/qa_pairs_mistral.jsonl", "w")
out_vp = open("data/qa_pairs_mistral_val.jsonl", "w")

for line in fp:
    qa_pair = json.loads(line)
    out_dict = {
        "input": qa_pair["query"],
        "output": qa_pair["response"]
    }
    out_fp.write(json.dumps(out_dict) + "\n")

for line in vp:
    vp_pair = json.loads(line)
    out_dict_vp = {
        "input": vp_pair["query"],
        "output": vp_pair["response"]
    }
    out_vp.write(json.dumps(out_dict_vp) + "\n")