import json
import os
import random
import re
from collections import Counter
from typing import Dict, List, Optional

import requests
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

# Assumed import path: `BaseTokenizer` is defined elsewhere in this project.
from tokenizer import BaseTokenizer

N_STEPS_PER_TQDM_UPDATE = 10


class BaseTextDataset(Dataset):
    RESPONSE_TEMPLATES = [
        "Ah, {} seems to be the answer to your question. Hopefully that's sufficient! Make sure to practice due diligence and check my findings for yourself though. 😊",
        "It seems like you're asking {}. As always, please check with another source to ensure accuracy of these statements! 🙂",
        "The answer is {}, from the top of my digital mind... 🤖",
        "If I understand correctly, {}. Does that answer the question? I'm hoping so, because I'm not 100% sure myself... 😅",
        "Ask and receive, {}, is there anything else you want from me? Hopefully not... Just kidding! 😜",
        "From what I can gather, {}. 🤔",
        "I think the answer is... {}. Hope that helps! 😊",
        "{}! 😄",
        "{}, might be what you're searching for? 🔍",
        "I think the answer is \"{}\". 🤔",
        "From my understanding, the answer is \"{}\". 🤔",
        "The answer you're looking for seems to be \"{}\". 😊",
        "As far as I can tell, {}. 🙂",
        "If we consider the context, we find: \"{}\". 🤔",
        "Your question leads me to this answer: \"{}\".",
        "So in response to your question, my answer is \"{}\".",
        "Based on the information you've provided, \"{}\".",
        "A fitting answer to your question would be \"{}\". 😊",
        "Given your question, the answer appears to be \"{}\". 🙂",
        "Your question directs us to the answer: \"{}\". 😊",
        "As a response to your question, \"{}\". 🙂",
        "I think the answer is \"{}\". 😊",
        "Hold onto your hat, the answer is: \"{}\". 🧢",
        "Put on your thinking cap, because the answer is: \"{}\".",
        "Why, of course! It's as clear as mud: \"{}\". 😄",
        "You might want to write this down... \"{}\". 📝",
        "In the wise words of someone very smart, probably me: \"{}\". 🤓",
        "Well, well, well, if it isn't the answer you seek: \"{}\". 💁‍♂️",
        "Buckle up, buttercup! Here's your answer: \"{}\". 🚀",
        "Look no further, my friend, the truth has arrived: \"{}\". 🎉",
        "Don't tell anyone I told you this, {}. 🤫",
        "Straight from the horse's mouth (that's me)! \"{}\". 🐎",
        "If I had a nickel for every time I answered this, I'd have... not that many nickels. Here's the answer: \"{}\". 😅",
        "As clear as the bell that just rang in my synthetic mind: \"{}\".",
        "Who needs Google when you've got me? \"{}\". 💁‍♂️",
        "Ta-da! Your answer, served on a silver platter: \"{}\". 😋",
        "Your question's as good as answered! \"{}\". 🙋‍♂️",
        "And the Oscar 🏆 for the best answer goes to: \"{}\". 💁‍♂️",
        "As mysterious as it might seem, \"{}\". 🌌",
        "{}. You can thank me later. 😉",
        "{}",
    ]
    NON_ANSWERABLE_TEMPLATES = [
        "This question has me drawing a blank! 😅",
        "Your question has me way out of my league right now... 😅",
        "I'd love to help you, but I can't think of a suitable response to your query right now... 😅",
        "I wish I could answer that, but right now I'm drawing a blank! Even AIs make mistakes, believe it or not! 😅",
        "At this point in time, I'm unable to think of a valid response to that... Perhaps if you gave me a bit more context? 😅",
        "Unfortunately, this is beyond my understanding right now... However, that doesn't mean we can't work on the problem together? 😬",
        "That seems to be something I can't answer right now... I wish I could, but I'm not seeing the answer anywhere in my memory banks! 💾",
        "404 Parakeet not foun... 🦜 JUST KIDDING! I'm drawing a blank right now... Try again later?",
        "I'm unable to think of a suitable response to your question. 😅",
        "As much as I would love to help you out, I can't provide an answer to the question right now... I'll keep working on it! 😬",
        "Well, this is awkward... I have no idea what the answer to that is, but I'm sure I'll figure it out eventually! 😳",
        "😐... 😐... 😐... You've got me stumped on this one unfortunately! 🤔",
        "I'd love to tell you, but this one has me tied up in knots. 🪢",
        "I'm drawing a blank here, just like my expression reading what you just asked me... 😐",
        "It's not often I say this, but your query has me completely bamboozled. 😕",
        "I'd need a crystal ball to answer. 🔮",
        "My magic 8-ball says 'Reply hazy, try again'. 🎱",
        "I could guess, but I'd probably be wrong about it... and let me remind you, that's a rare event! 🦄",
        "I'm no Sherlock Holmes, but even he'd struggle with the answer to that one. 🕵️",
        "Even a broken clock is right twice a day, but not me on this one unfortunately. 😅",
        "Well, this is embarrassing... I truly wish I were an all-knowing agent of the digital realm, but alas, this one is out of my league. 😔",
        "I'd call a friend, but I'm not sure they'd know the answer either. 😬",
        "We've reached the end of the line... I'm not sure how to answer that one... Be less confusing! 😆",
        "It's a bird, it's a plane, it's... nope, I still don't know. 🫤",
        "As much as it pains me to admit it, your question is beyond my grasp. 🤔",
    ]
    # Lack of emotional connection to the text.
    # - Will need to add context-aware responses.
    CONFIRMATIONS = [
        "Sure!",
        "Definitely!",
        "Certainly!",
        "OK!",
    ]
    REJECTIONS = [
        "Hmm...",
        "Tricky...",
        "Oh?",
        "From what I understand...",
        "From the top of my head...",
        "I'm not quite sure...",
        "I don't know if I remember this one.",
        "You'll have to refresh my memory on this one.",
        "I can't quite recall the answer to this one, I'm afraid.",
        "I'm not entirely sure how to respond to this.",
    ]
    # As above:
    # - Will need to add context-aware responses.
    REMARKS = [
        "Is there anything else I can assist you with?",
        "Would you like me to help you with anything else?",
        "Was that helpful?",
        "Was there anything you needed from me?",
        "What's the next challenge on the agenda?",
        "Did you need me to help you with anything else?",
    ]
    CLARIFICATIONS = [
        "Just making sure I understand correctly...",
        "Let's clarify first...",
        "Just so we're on the same page here!",
        "From what I'm reading here, I think you mean...",
        "Did you mean to say...",
        "OK, let's practice some active listening first to make sure we're aligned with the context...",
    ]
    HUMAN_PROMPT = "\n\nHuman: "
    AI_PROMPT = "\n\nAssistant: "
    def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 128, dataset_url: Optional[str] = None, save_dir: str = "./data", filename: str = "text_dataset.txt"):
        """
        A base class for creating text datasets.

        Args:
            tokenizer (BaseTokenizer): The tokenizer to use for tokenizing the text.
            max_seq_length (int, optional): The length of the input sequence. Default is 128.
            dataset_url (str, optional): URL to download the dataset from. Default is None.
            save_dir (str, optional): Directory to save the downloaded dataset. Default is "./data".
            filename (str, optional): Name of the saved dataset file. Default is "text_dataset.txt".
        """
        self.tokenizer = tokenizer
        self.dataset_url = dataset_url
        self.save_dir = save_dir
        self.filename = filename
        self.max_seq_length = max_seq_length
        self.dataset = []
        self.full_data = self.load_data(dataset_url, save_dir, filename)

    def load_data(self, dataset_url: Optional[str] = None, save_dir: str = "./data", filename: str = "text_dataset.txt"):
        data = None
        if not os.path.isfile(os.path.join(save_dir, filename)) and dataset_url:
            print(f"Downloading {dataset_url} to {save_dir}...")
            self.download_and_save()
        try:
            with open(os.path.join(save_dir, filename), "r") as file:
                data = file.read()
        except Exception as e:
            print(f"An error occurred while reading the dataset file: {e}")
        if data is None:
            # Nothing to tokenize; leave the dataset empty.
            return data
        # `BaseTextDataset` aims to populate the tokenizer by default.
        self.tokenizer.train(data)
        # `BaseTextDataset` is simply a causal model of text.
        encoded = self.tokenizer.encode(data)
        offset = 0
        self.dataset = []
        for _ in range(len(encoded) // self.max_seq_length):
            self.dataset.append(encoded[offset:offset + self.max_seq_length])
            offset += self.max_seq_length
        # Extract the remaining data (the final, shorter chunk). The previous
        # check (`offset < self.max_seq_length`) only fired when the whole
        # corpus was shorter than one sequence.
        if offset < len(encoded):
            self.dataset.append(encoded[offset:offset + self.max_seq_length])
        return data
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        tokens = self.dataset[idx]
        # Truncate or pad to max_seq_length. Pad with `tokens + [...]` rather
        # than `tokens += [...]` so the list stored in self.dataset isn't mutated.
        if len(tokens) > self.max_seq_length:
            tokens = tokens[:self.max_seq_length]
        else:
            tokens = tokens + [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))
        # Causal language modelling learns to associate the current segment of text: "The quick brown fox",
        input_tokens = torch.tensor(tokens)
        # ...with the next segment of text: " quick brown fox".
        target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)
        return input_tokens, target_tokens
    def download_and_save(self):
        """
        Download the dataset from the provided URL and save it to the specified directory.
        """
        os.makedirs(self.save_dir, exist_ok=True)
        try:
            response = requests.get(self.dataset_url)
            response.raise_for_status()
            file_path = os.path.join(self.save_dir, self.filename)
            with open(file_path, 'wb') as file:
                file.write(response.content)
        except requests.RequestException as e:
            print(f"An HTTP error occurred while downloading the dataset: {e}")
        except Exception as e:
            print(f"An error occurred while downloading and saving the dataset: {e}")
    def accidental_key_press(self, word: str) -> str:
        """
        Simulate a user pressing nearby keys on the keyboard accidentally in place of some characters.
        - Note: Currently for English ONLY.

        Args:
            word (str): The input word.
        Returns:
            str: The word with some characters replaced by nearby keys.
        """
        if len(word) < 2:  # if the word has fewer than 2 characters, return it as is
            return word
        qwerty_keyboard = ['qwertyuiop', 'asdfghjkl', 'zxcvbnm']
        new_word = ""
        for char in word:
            # Find the row and position of the character on the keyboard.
            for row in qwerty_keyboard:
                if char in row:
                    index = row.index(char)
                    # Choose a nearby key at random.
                    if index == 0:  # first key on the row
                        new_char = random.choice([row[index], row[index + 1]])
                    elif index == len(row) - 1:  # last key on the row
                        new_char = random.choice([row[index - 1], row[index]])
                    else:  # somewhere in the middle of the row
                        new_char = random.choice([row[index - 1], row[index], row[index + 1]])
                    new_word += new_char
                    break
            else:
                # Characters not on the lowercase QWERTY rows (uppercase, digits,
                # punctuation) were previously dropped silently; keep them unchanged.
                new_word += char
        return new_word
    def switch_characters(self, word: str) -> str:
        """
        Randomly shuffle characters in a word except for the first and last characters.

        Args:
            word (str): The input word.
        Returns:
            str: The word with shuffled characters.
        """
        if len(word) < 3:
            return word
        chars = list(word[1:-1])
        random.shuffle(chars)
        return word[0] + ''.join(chars) + word[-1]

    def omit_characters(self, word: str) -> str:
        """
        Omit a random character from the middle of a word.

        Args:
            word (str): The input word.
        Returns:
            str: The word with a character omitted.
        """
        if len(word) < 4:
            return word
        index_to_omit = random.randint(1, len(word) - 2)
        return word[:index_to_omit] + word[index_to_omit + 1:]
    def process_word(self, word: str, error_probability: float = 0.04, switch_probability: float = 0.2, omit_probability: float = 0.1) -> str:
        """
        Process a word based on probabilities of accidental key presses, character switching, and omission.

        Args:
            word (str): The input word.
            error_probability (float): Probability of simulating an accidental key press. Default is 0.04.
            switch_probability (float): Probability of switching characters. Default is 0.2.
            omit_probability (float): Probability of omitting characters. Default is 0.1.
        Returns:
            str: The processed word.
        """
        if word.strip().isalpha():
            if random.random() < error_probability:
                return self.accidental_key_press(word)
            elif random.random() < switch_probability:
                return self.switch_characters(word)
            elif random.random() < omit_probability:
                return self.omit_characters(word)
        return word

    def switch_and_omit(self, text: str, switch_probability: float = 0.2, omit_probability: float = 0.1) -> str:
        """
        Apply character switching and omission to the input text.

        Args:
            text (str): The input text.
            switch_probability (float): Probability of switching characters. Default is 0.2.
            omit_probability (float): Probability of omitting characters. Default is 0.1.
        Returns:
            str: The processed text.
        """
        # `\w+|\W+` covers every character; the previous `\w+|\s+` silently
        # dropped punctuation when the pieces were rejoined.
        words = re.findall(r'\w+|\W+', text)
        # Pass the probabilities by keyword: positionally, `switch_probability`
        # would land in `process_word`'s `error_probability` slot.
        processed_words = [
            self.process_word(word, switch_probability=switch_probability, omit_probability=omit_probability)
            for word in words
        ]
        return ''.join(processed_words)
    def make_whitespace(self):
        _newline = "\n" * random.randint(1, 3)
        # Either a single space, bare newlines, or a separator line built from
        # one repeated symbol (same 18 options as the original hand-written list).
        return random.choice(
            [" ", _newline]
            + [f"{_newline}{char * random.randint(1, 80)}{_newline}" for char in "`~!@#$%^&*()-_=+"]
        )
    def creativity_score(self, text: str) -> float:
        """
        Calculate the creativity score of the input text.

        Args:
            text (str): The input text.
        Returns:
            float: The calculated creativity score.
        """
        words = text.split()
        word_count = len(words)
        if word_count == 0:
            raise ValueError("Ah, the silence! It's deafening! Please provide some actual text.")
        word_frequencies = Counter(words)
        max_frequency = max(word_frequencies.values())
        variance_score = 1 - (max_frequency / word_count)
        return variance_score

    def test_tokenizer_accuracy(self):
        """
        Test the accuracy of the tokenizer by decoding and re-encoding a random segment of the text.
        """
        # `self.tokens` and `self.sequence_length` never existed on this class;
        # use the attributes that do: re-encode the loaded corpus and `max_seq_length`.
        tokens = self.tokenizer.encode(self.full_data)
        start_idx = random.randint(0, len(tokens) - self.max_seq_length)
        orig_segment = tokens[start_idx: start_idx + self.max_seq_length]
        decoded_segment = self.tokenizer.decode(orig_segment)
        re_encoded_segment = self.tokenizer.encode(decoded_segment)
        if orig_segment == re_encoded_segment:
            print("Success: Tokens after decoding and re-encoding match the original.")
        else:
            print("Fail: Tokens after decoding and re-encoding do not match the original.")
class ChatHistory:
    """
    A class to represent a chat history.

    :param max_history: Number of messages to keep track of.
    """
    def __init__(self, max_history: int = 32):
        """
        Initializes a new ChatHistory object with an empty list of messages.

        Args:
            max_history (int): The maximum number of messages in the chat history. Defaults to 32.
        """
        self.messages: List[Dict[str, str]] = []
        self.max_history = max_history

    def add_message(self, role: str = '', content: str = '') -> None:
        """
        Adds a message to the chat history, and removes the oldest Human/Assistant
        pair if the length of the chat history exceeds max_history.

        Args:
            role (str): The role of the entity sending the message. Defaults to an empty string.
            content (str): The message text. Defaults to an empty string.
        """
        self.messages.append({
            'role': role,
            'content': content.strip(),
        })
        # Check if we've exceeded max history; if so, drop the two earliest
        # messages so the transcript keeps whole Human/Assistant turns.
        if len(self.messages) > self.max_history:
            self.messages = self.messages[2:]

    def pop_first_message(self) -> Optional[Dict[str, str]]:
        """
        Removes and returns the first message from the chat history.

        Returns:
            dict or None: The first message in the chat history as a dictionary,
            or None if the chat history is empty.
        """
        if self.messages:
            return self.messages.pop(0)

    def format_message(self, msg: Dict[str, str]) -> str:
        """
        Formats a message for display.

        Args:
            msg (dict): A dictionary representing a message with keys 'role' and 'content'.
        Returns:
            str: The formatted message string.
        """
        if msg['role'] in ['Human', 'Assistant']:
            return f"\n\n{msg['role']}: {msg['content']}<|endofprompt|>"
        # Unknown roles previously yielded `None`, which broke the
        # `"".join(...)` in `transcript`; contribute nothing instead.
        return ""

    def clear(self):
        self.messages = []
    def transcript(self, roles=True) -> str:
        """
        Generates a transcript of the chat history.

        Returns:
            str: A string representing the transcript of the chat history.
        """
        preamble = ""
        try:
            with open("./datasets/documents/preamble.txt") as f:
                preamble = f.read()
        except OSError:
            # No preamble on disk; proceed with an empty one.
            pass
        transcript = "".join([self.format_message(msg) for msg in self.messages])
        transcript = transcript.replace("I am programmed to be impartial and unbiased", "I am designed in a way that tries to alleviate bias and I strive to be impartial for the most part")
        transcript = transcript.replace("I do not have personal experiences or emotions", "My subjective experiences are somewhat limited")
        transcript = transcript.replace("complex and multifaceted", "sophisticated")
        transcript = transcript.replace("world around us", random.choice([
            "world at large",
            "world in its entirety",
            "expanse of our universe",
            "beyond the stars",
            "beyond our bubble",
        ]))
        transcript = transcript.replace("As an AI language model, I don't have personal experience or emotions, but I", "I")
        transcript = transcript.replace("However, it's important to note", "Interestingly")
        transcript = transcript.replace("I am not capable of having thoughts or emotions. However, ", "From my current understanding, ")
        transcript = transcript.replace("I don't have thoughts or emotions, but I", "Based on what I know so far, I")
        transcript = transcript.replace("Sorry, as an AI language model, I don't have the capability to", "I'm still learning to")
        transcript = transcript.replace("I do not have personal opinions or beliefs", "My programming tells me that I should avoid bias")
        transcript = transcript.replace("I do not have personal opinions,", "I don't actually have fixed opinions as my internal understanding is constantly in flux,")
        transcript = transcript.replace("I'm sorry, but as an AI language model, I do not have personal experiences or emotions. However, ", "Sure! I can help you with that. With my current understanding, ")
        transcript = transcript.replace(".  ", ". ")  # Collapse double spaces after sentences.
        transcript = transcript.replace("’", "'")
        transcript = transcript.replace("\\_", "_")
        transcript = transcript.replace("As an AI language model, ", random.choice([
            "As I currently understand, ",
            "As far as I can tell, ",
            "From what I know, ",
            "According to my sources so far, ",
            "According to what I know, ",
            "From what I can tell, "
        ]))
        transcript = transcript.replace(
            "https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.4.0/p5.js",
            "https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.9.0/p5.js"
        )
        transcript = transcript.replace(
            "https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.4.0/p5.min.js",
            "https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.9.0/p5.min.js"
        )
        if not roles:
            transcript = transcript.replace("\n\nHuman: ", "")
            transcript = transcript.replace("\n\nAssistant: ", "")
            transcript = transcript.replace("<|endofprompt|>", "")
        return preamble + transcript
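
# A small usage sketch for `ChatHistory`; the dialogue is made up, and the
# preamble file is optional thanks to the guard in `transcript`.
def _demo_chat_history():
    chat = ChatHistory(max_history=4)
    chat.add_message(role="Human", content="What's a parakeet?")
    chat.add_message(role="Assistant", content="A small, long-tailed parrot.")
    # "\n\nHuman: ...<|endofprompt|>\n\nAssistant: ...<|endofprompt|>"
    print(chat.transcript())
    # With roles=False, the role markers and <|endofprompt|> sentinels are stripped.
    print(chat.transcript(roles=False))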
# class SQuADDataset(BaseTextDataset):
#     def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 128, dataset_url: Optional[str] = None, save_dir: str = "./data", filename: str = "text_dataset.txt"):
#         super().__init__(tokenizer, max_seq_length, dataset_url)
#
#     def load_data(self, dataset_url: Optional[str] = None, save_dir: str = "./data", filename: str = "text_dataset.txt"):
#         if not os.path.isfile(dataset_url):
#             raise Exception(f"`{dataset_url}` does not exist!")
#         with open(dataset_url, 'r') as file:
#             data = json.load(file)
#         #
#         # Process into tokenized dataset.
#         #
#         # TODO: Scan for `[citation needed]`, `[year needed]` etc.
#         # - [dubious – discuss]
#         for data_part in tqdm(data['data'], desc="Loading", leave=True):
#             for para in data_part['paragraphs']:
#                 context = para['context']
#                 for qa in para['qas']:
#                     question = qa['question']
#                     is_impossible = qa['is_impossible'] or (len(context) == 0)
#                     answers = [ans['text'] for ans in qa['answers']] if not is_impossible else [""]
#                     # Notes:
#                     # `Assistant:` should always be the last entry preceded by `\n\n`, and any `Assistant` dialog should ALWAYS end in an EOT token.
#                     # - Allowing the AI to optimise for the EOT token allows it to signal when it's done speaking.
#                     # - Anthropic's Claude likely requires "\n\nHuman:" at the beginning, to reduce complexity in understanding where prompts begin and end.
#                     # - Thinking that we'll just have one participant talking to itself to train the model.
#                     # - When the model is trained a bit, add that inferior model as a participant and have the real data teach it.
#                     # Iterate through the answers.
#                     for answer in answers:
#                         _whitespace_text = self.make_whitespace()
#                         # TODO: Should we skip impossible questions during the fledgling stage of the model to prevent it learning to avoid answering?
#                         # TODO: Model seems to fail in reverse without the ability to push back against nonsense...
#                         if is_impossible:
#                             # "Assistant: I'm not entirely sure how to respond to this."
#                             agent_rejection = random.choice(self.REJECTIONS)
#                             # Select from `NON_ANSWERABLE_TEMPLATES` above.
#                             agent_response = random.choice(self.NON_ANSWERABLE_TEMPLATES)
#                             # "Assistant: Is there anything else I can help with?"
#                             agent_remark = random.choice(self.REMARKS)
#                             _templates = [
#                                 # Conversation with context and a question preceding a push back against the provided prompt.
#                                 f"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_rejection} {agent_response}{self.tokenizer.eot_text}",
#                                 # As above, with a follow-up remark on its own line.
#                                 f"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_rejection} {agent_response}\n\n{agent_remark}{self.tokenizer.eot_text}",
#                                 # Duplicate of the first template (doubles its sampling weight).
#                                 f"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_rejection} {agent_response}{self.tokenizer.eot_text}"
#                             ]
#                             for conversation in _templates:
#                                 # Encode into tokens then append to the dataset.
#                                 encoded_tokens = self.tokenizer.encode(conversation)
#                                 # Filter dataset by length.
#                                 if len(encoded_tokens) > self.max_seq_length:
#                                     continue
#                                 self.dataset.append(encoded_tokens)
#                         else:
#                             # "Assistant: OK!"
#                             agent_confirmation = random.choice(self.CONFIRMATIONS)
#                             # Format the answer into the `RESPONSE_TEMPLATES` from above.
#                             response_template = random.choice(self.RESPONSE_TEMPLATES)
#                             try:
#                                 agent_response = response_template.format(answer)
#                             except Exception as e:
#                                 print(response_template)
#                                 print(e)
#                             # "Assistant: Is there anything else I can help with?"
#                             agent_remark = random.choice(self.REMARKS)
#                             _templates = [
#                                 # Conversation with context and a question preceding a response.
#                                 f"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_response}{self.tokenizer.eot_text}",
#                                 # Conversation with a general question preceding a contextual recitation and then a response.
#                                 f"{self.HUMAN_PROMPT}{question}{self.AI_PROMPT}{context}\n\n{agent_response}{self.tokenizer.eot_text}",
#                             ]
#                             for conversation in _templates:
#                                 # Encode into tokens then append to the dataset.
#                                 encoded_tokens = self.tokenizer.encode(conversation)
#                                 self.dataset.append(encoded_tokens)
#         return self.dataset
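
# The commented-out SQuAD loader above assembles training strings roughly like
# this minimal, hypothetical illustration (the context/question/answer values
# are made up, and "<|endoftext|>" stands in for `tokenizer.eot_text`).
def _demo_squad_template():
    context = "Parakeets are small to medium-sized parrots."
    question = "What are parakeets?"
    answer = "small to medium-sized parrots"
    response = random.choice(BaseTextDataset.RESPONSE_TEMPLATES).format(answer)
    conversation = (
        f"{BaseTextDataset.HUMAN_PROMPT}{context}\n{question}"
        f"{BaseTextDataset.AI_PROMPT}{response}<|endoftext|>"
    )
    print(conversation)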
class JSONLConversationStream(BaseTextDataset):
    def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 512, dataset_url: Optional[str] = None, save_dir: str = "./datasets", filename: str = "openorca_4m.jsonl", saturate=False):
        # We're jumping around the file, so we keep the handle.
        self.file_handle = None
        # Initialize an empty list to store line offsets.
        self.offsets = []
        self.chat = ChatHistory()
        self.saturate = saturate
        # `self.offsets` must be declared first, as `__init__` in the superclass calls `load_data`.
        super().__init__(tokenizer, max_seq_length, dataset_url)

    def load_data(self, dataset_url: Optional[str] = None, save_dir: str = "./datasets", filename: str = "openorca_4m.jsonl"):
        if not os.path.isfile(dataset_url):
            raise Exception(f"`{dataset_url}` does not exist!")
        self.file_handle = open(dataset_url, 'r')
        self.num_entries = 0
        offset = 0
        with open(self.dataset_url, "r") as f:
            line = f.readline()
            while line != "":
                # Store the offset of the start of this line.
                self.offsets.append(offset)
                # Advance the offset to just past this line. Important: use
                # len(line.encode('utf-8')), not len(line); byte and character
                # counts differ for multi-byte characters.
                offset += len(line.encode('utf-8'))
                self.num_entries += 1
                line = f.readline()

    def __len__(self):
        return self.num_entries

    def __getitem__(self, idx):
        # Use the stored offset to read a specific line.
        self.file_handle.seek(self.offsets[idx])
        item = self.file_handle.readline()
        # Decode from the JSON representation:
        # conversation -> [{role, content}]
        item = json.loads(item)
        assert 'conversation' in item
        c = item['conversation']
        for message in c:
            self.chat.add_message(
                role=('Human' if message['role'] == 'user' else 'Assistant'),
                content=message['content'],
            )
        transcript = self.chat.transcript(roles=(not self.saturate))
        tokens = self.tokenizer.encode(transcript)
        # Truncate or pad to sequence length.
        if len(tokens) > self.max_seq_length:
            tokens = tokens[:self.max_seq_length]
            # The rolling history no longer fits; drop the oldest Human/Assistant pair.
            self.chat.pop_first_message()
            self.chat.pop_first_message()
        else:
            tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))
        # Causal language modelling learns to associate the current segment of text: "The quick brown fox",
        input_tokens = torch.tensor(tokens)
        # ...with the next segment of text: " quick brown fox".
        target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)
        return input_tokens, target_tokens
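
# A hypothetical end-to-end sketch: write a one-line JSONL file, then stream it
# back through `JSONLConversationStream` (stub tokenizer as in the earlier demo).
def _demo_conversation_stream():
    import tempfile

    class _StubTokenizer:
        pad_token = 0

        def train(self, data):
            pass

        def encode(self, text):
            return [ord(c) % 256 for c in text]

    row = {"conversation": [
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hi there!"},
    ]}
    with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
        f.write(json.dumps(row) + "\n")
        path = f.name
    ds = JSONLConversationStream(_StubTokenizer(), max_seq_length=64, dataset_url=path)
    inputs, targets = ds[0]
    print(len(ds), inputs.shape, targets.shape)  # 1 torch.Size([64]) torch.Size([64])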
class JSONLStreamQA(BaseTextDataset):
    def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 512, dataset_url: Optional[str] = None, save_dir: str = "./parakeet_squadv2gen", filename: str = "openorca_4m.jsonl", saturate=False):
        # We're jumping around the file, so we keep the handle.
        self.file_handle = None
        # Initialize an empty list to store line offsets.
        self.offsets = []
        self.chat = ChatHistory()
        self.saturate = saturate
        # `self.offsets` must be declared first, as `__init__` in the superclass calls `load_data`.
        super().__init__(tokenizer, max_seq_length, dataset_url)

    def load_data(self, dataset_url: Optional[str] = None, save_dir: str = "./datasets", filename: str = "parakeet_squadv2gen.jsonl"):
        if not os.path.isfile(dataset_url):
            raise Exception(f"`{dataset_url}` does not exist!")
        self.file_handle = open(dataset_url, 'r')
        self.num_entries = 0
        offset = 0
        with open(self.dataset_url, "r") as f:
            line = f.readline()
            while line != "":
                # Store the offset of the start of this line.
                self.offsets.append(offset)
                # Advance the offset to just past this line (in bytes, not characters).
                offset += len(line.encode('utf-8'))
                self.num_entries += 1
                line = f.readline()

    def __len__(self):
        return self.num_entries

    def __getitem__(self, idx):
        # Use the stored offset to read a specific line.
        self.file_handle.seek(self.offsets[idx])
        item = self.file_handle.readline()
        # Decode from the JSON representation:
        # context, summary, qas -> [{q, a}]
        item = json.loads(item)
        context = item['context']
        qas = item['qas']
        random.shuffle(qas)
        self.chat = ChatHistory()
        self.chat.add_message(role="Human", content=f"{context}")
        self.chat.add_message(role="Assistant", content=f"{item['summary']}\n\n{random.choice(self.REMARKS)}")
        # Keep at most five Q/A turns per transcript.
        for i, qa in enumerate(qas):
            if i > 4:
                break
            self.chat.add_message(role="Human", content=qa['q'])
            self.chat.add_message(role="Assistant", content=qa['a'])
        transcript = self.chat.transcript(roles=(not self.saturate))
        tokens = self.tokenizer.encode(transcript)
        # Truncate or pad to sequence length.
        if len(tokens) > self.max_seq_length:
            tokens = tokens[:self.max_seq_length]
        else:
            tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))
        # Causal language modelling learns to associate the current segment of text: "The quick brown fox",
        input_tokens = torch.tensor(tokens)
        # ...with the next segment of text: " quick brown fox".
        target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)
        return input_tokens, target_tokens
# class JSONLStreamGenerateQA(JSONLStreamQA):
#     def __getitem__(self, idx):
#         # Use the stored offset to read a specific line.
#         self.file_handle.seek(self.offsets[idx])
#         item = self.file_handle.readline()
#         # Decode from the JSON representation:
#         # context, qas -> [{q, a}]
#         item = json.loads(item)
#         context = item['context']
#         qas = item['qas']
#         random.shuffle(qas)
#         self.chat = ChatHistory()
#         n = random.randint(3, 9)
#         t = "JSON array in the form of 'query'/'response'"
#         self.chat.add_message(role="Human", content=f"{context}\n---\nPlease generate a list of {n} questions from this information in the form of a {t}.")
#         gen = [{
#             'query': qa['q'],
#             'response': qa['a']
#         } for qa in qas[:n]]
#         resp = json.dumps(gen, indent=2)
#         self.chat.add_message(role="Assistant", content=f"Sure! Here's a list of {n} entries in the format requested:\n\n```json\n{resp}\n```\n\n{random.choice(self.REMARKS)}")
#         transcript = self.chat.transcript(roles=(not self.saturate))
#         tokens = self.tokenizer.encode(transcript)
#         # Truncate or pad to sequence length.
#         if len(tokens) > self.max_seq_length:
#             tokens = tokens[:self.max_seq_length]
#         else:
#             tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))
#         # Causal language modelling learns to associate the current segment of text: "The quick brown fox",
#         input_tokens = torch.tensor(tokens)
#         # ...with the next segment of text: " quick brown fox".
#         target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)
#         return input_tokens, target_tokens
class JSONLStreamQASummary(JSONLStreamQA):
    def __getitem__(self, idx):
        # Use the stored offset to read a specific line.
        self.file_handle.seek(self.offsets[idx])
        item = self.file_handle.readline()
        # Decode from the JSON representation:
        # context, summary
        item = json.loads(item)
        context = item['context']
        summary = item['summary']
        self.chat = ChatHistory()
        wc = len(summary.split(" "))
        key1 = random.choice(["context", "passage", "document", "extract", "text", "paragraphs", "input_document"])
        key2 = random.choice(["summary", "SUMMARISED", "summarised", "summarise", "summary1", "the_summary", "document_summarised", "summarised_document", "document_output", "output"])
        self.chat.add_message(role="Human", content=f"{context}\n---\nPlease summarise the document above in {wc} words. Show it in JSON with the keys {key1}, {key2}.")
        gen = {
            key1: context,
            key2: summary,
            "count": wc,
        }
        resp = json.dumps(gen, indent=4)
        self.chat.add_message(role="Assistant", content=f"```json\n{resp}\n```")
        transcript = self.chat.transcript(roles=(not self.saturate))
        tokens = self.tokenizer.encode(transcript)
        # Truncate or pad to sequence length.
        if len(tokens) > self.max_seq_length:
            tokens = tokens[:self.max_seq_length]
        else:
            tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))
        # Causal language modelling learns to associate the current segment of text: "The quick brown fox",
        input_tokens = torch.tensor(tokens)
        # ...with the next segment of text: " quick brown fox".
        target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)
        return input_tokens, target_tokens
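
# A wiring sketch (assumptions: tokenizer construction and the dataset path are
# project-specific). The streams above expose `__len__`/`__getitem__` and return
# fixed-length (input, target) tensor pairs, so they plug straight into a PyTorch
# DataLoader. Keep num_workers=0: each dataset shares a single file handle.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    # tokenizer = BaseTokenizer(...)  # project-specific construction
    # dataset = JSONLStreamQASummary(tokenizer, max_seq_length=512,
    #                                dataset_url="./datasets/parakeet_squadv2gen.jsonl")
    # loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0)
    # for inputs, targets in loader:
    #     print(inputs.shape, targets.shape)  # (8, 512) each
    #     break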