import json
import os
import random
import re
from collections import Counter
from typing import Dict, List, Optional

import requests
import torch
from torch.utils.data import Dataset
from tqdm import tqdm

# Assumed import path: `BaseTokenizer` is defined elsewhere in this project.
from tokenizer import BaseTokenizer

N_STEPS_PER_TQDM_UPDATE = 10


class BaseTextDataset(Dataset):
    RESPONSE_TEMPLATES = [
        "Ah, {} seems to be the answer to your question. Hopefully that's sufficient! Make sure to practice due diligence and check my findings for yourself though. 😊",
        "It seems like you're asking {}. As always, please check with another source to ensure accuracy of these statements! 🙂",
        "The answer is {}, from the top of my digital mind... 🤖",
        "If I understand correctly, {}. Does that answer the question? I'm hoping so, because I'm not 100% sure myself... 😅",
        "Ask and receive, {}, is there anything else you want from me? Hopefully not... Just kidding! 😜",
        "From what I can gather, {}. 🤔",
        "I think the answer is... {}. Hope that helps! 😊",
        "{}! 😄",
        "{}, might be what you're searching for? 🔍",
        "I think the answer is \"{}\". 🤔",
        "From my understanding, the answer is \"{}\". 🤔",
        "The answer you're looking for seems to be \"{}\". 😊",
        "As far as I can tell, {}. 🙂",
        "If we consider the context, we find: \"{}\". 🤔",
        "Your question leads me to this answer: \"{}\".",
        "So in response to your question, my answer is \"{}\".",
        "Based on the information you've provided, \"{}\".",
        "A fitting answer to your question would be \"{}\". 😊",
        "Given your question, the answer appears to be \"{}\". 🙂",
        "Your question directs us to the answer: \"{}\". 😊",
        "As a response to your question, \"{}\". 🙂",
        "I think the answer is \"{}\". 😊",
        "Hold onto your hat, the answer is: \"{}\". 🧢",
        "Put on your thinking cap, because the answer is: \"{}\".",
        "Why, of course! It's as clear as mud: \"{}\". 😄",
        "You might want to write this down... \"{}\". 📝",
        "In the wise words of someone very smart, probably me: \"{}\". 🤓",
        "Well, well, well, if it isn't the answer you seek: \"{}\". 💁‍♂️",
        "Buckle up, buttercup! Here's your answer: \"{}\". 🚀",
        "Look no further, my friend, the truth has arrived: \"{}\". 🎉",
        "Don't tell anyone I told you this, {}. 🤫",
        "Straight from the horse's mouth (that's me)! \"{}\". 🐎",
        "If I had a nickel for every time I answered this, I'd have... not that many nickels. Here's the answer: \"{}\". 😅",
        "As clear as the bell that just rang in my synthetic mind: \"{}\".",
        "Who needs Google when you've got me? \"{}\". 💁‍♂️",
        "Ta-da! Your answer, served on a silver platter: \"{}\". 😋",
        "Your question's as good as answered! \"{}\". 🙋‍♂️",
        "And the Oscar 🏆 for the best answer goes to: \"{}\". 💁‍♂️",
        "As mysterious as it might seem, \"{}\". 🌌",
        "{}. You can thank me later. 😉",
        "{}",
    ]
    NON_ANSWERABLE_TEMPLATES = [
        "This question has me drawing a blank! 😅",
        "Your question has me way out of my league right now... 😅",
        "I'd love to help you, but I can't think of a suitable response to your query right now... 😅",
        "I wish I could answer that, but right now I'm drawing a blank! Even AIs make mistakes, believe it or not! 😅",
        "At this point in time, I'm unable to think of a valid response to that... Perhaps if you gave me a bit more context? 😅",
        "Unfortunately, this is beyond my understanding right now... However, that doesn't mean we can't work on the problem together? 😬",
        "That seems to be something I can't answer right now... I wish I could, but I'm not seeing the answer anywhere in my memory banks! 💾",
        "404 Parakeet not foun... 🦜 JUST KIDDING! I'm drawing a blank right now... Try again later?",
        "I'm unable to think of a suitable response to your question. 😅",
        "As much as I would love to help you out, I can't provide an answer to the question right now... I'll keep working on it! 😬",
        "Well, this is awkward... I have no idea what the answer to that is, but I'm sure I'll figure it out eventually! 😳",
        "😐... 😐... 😐... You've got me stumped on this one unfortunately! 🤔",
        "I'd love to tell you, but this one has me tied up in knots. 🪢",
        "I'm drawing a blank here, just like my expression reading what you just asked me... 😐",
        "It's not often I say this, but your query has me completely bamboozled. 😕",
        "I'd need a crystal ball to answer. 🔮",
        "My magic 8-ball says 'Reply hazy, try again'. 🎱",
        "I could guess, but I'd probably be wrong about it... and let me remind you, that's a rare event! 🦄",
        "I'm no Sherlock Holmes, but even he'd struggle with the answer to that one. 🕵️",
        "Even a broken clock is right twice a day, but not me on this one unfortunately. 😅",
        "Well, this is embarrassing... I truly wish I were an all-knowing agent of the digital realm, but alas, this one is out of my league. 😔",
        "I'd call a friend, but I'm not sure they'd know the answer either. 😬",
        "We've reached the end of the line... I'm not sure how to answer that one... Be less confusing! 😆",
        "It's a bird, it's a plane, it's... nope, I still don't know. 🫤",
        "As much as it pains me to admit it, your question is beyond my grasp. 🤔",
    ]
    # Lack of emotional connection to the text.
    # - Will need to add context-aware responses.
    CONFIRMATIONS = [
        "Sure!",
        "Definitely!",
        "Certainly!",
        "OK!",
    ]
    REJECTIONS = [
        "Hmm...",
        "Tricky...",
        "Oh?",
        "From what I understand...",
        "From the top of my head...",
        "I'm not quite sure...",
        "I don't know if I remember this one.",
        "You'll have to refresh my memory on this one.",
        "I can't quite recall the answer to this one, I'm afraid.",
        "I'm not entirely sure how to respond to this.",
    ]
    # As above:
    # - Will need to add context-aware responses.
    REMARKS = [
        "Is there anything else I can assist you with?",
        "Would you like me to help you with anything else?",
        "Was that helpful?",
        "Was there anything you needed from me?",
        "What's the next challenge on the agenda?",
        "Did you need me to help you with anything else?",
    ]
    CLARIFICATIONS = [
        "Just making sure I understand correctly...",
        "Let's clarify first...",
        "Just so we're on the same page here!",
        "From what I'm reading here, I think you mean...",
        "Did you mean to say...",
        "OK, let's practice some active listening first to make sure we're aligned with the context...",
    ]
    HUMAN_PROMPT = "\n\nHuman: "
    AI_PROMPT = "\n\nAssistant: "
    def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 128, dataset_url: Optional[str] = None, save_dir: str = "./data", filename: str = "text_dataset.txt"):
        """
        A base class for creating text datasets.

        Args:
            tokenizer (BaseTokenizer): The tokenizer to use for tokenizing the text.
            max_seq_length (int, optional): The length of the input sequence. Default is 128.
            dataset_url (str, optional): URL to download the dataset from. Default is None.
            save_dir (str, optional): Directory to save the downloaded dataset. Default is "./data".
            filename (str, optional): Name of the saved dataset file. Default is "text_dataset.txt".
        """
        self.tokenizer = tokenizer
        self.dataset_url = dataset_url
        self.save_dir = save_dir
        self.filename = filename
        self.max_seq_length = max_seq_length
        self.dataset = []
        self.full_data = self.load_data(dataset_url, save_dir, filename)

    def load_data(self, dataset_url: Optional[str] = None, save_dir: str = "./data", filename: str = "text_dataset.txt"):
        data = None
        if not os.path.isfile(os.path.join(save_dir, filename)) and dataset_url:
            print(f"Downloading {dataset_url} to {save_dir}...")
            self.download_and_save()
        try:
            with open(os.path.join(save_dir, filename), "r") as file:
                data = file.read()
        except Exception as e:
            print(f"An error occurred while reading the dataset file: {e}")
        if data is None:
            # Nothing to tokenize; leave the dataset empty.
            return data
        # `BaseTextDataset` aims to populate the tokenizer by default.
        self.tokenizer.train(data)
        # `BaseTextDataset` is simply a causal model of text.
        encoded = self.tokenizer.encode(data)
        offset = 0
        self.dataset = []
        for _ in range(len(encoded) // self.max_seq_length):
            self.dataset.append(encoded[offset:offset + self.max_seq_length])
            offset += self.max_seq_length
        # Extract the remaining data (the final, shorter chunk). The previous
        # check (`offset < self.max_seq_length`) only fired when the whole
        # corpus was shorter than one sequence.
        if offset < len(encoded):
            self.dataset.append(encoded[offset:offset + self.max_seq_length])
        return data
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        tokens = self.dataset[idx]
        # Truncate or pad to max_seq_length. Pad with `tokens + [...]` rather
        # than `tokens += [...]` so the list stored in self.dataset isn't mutated.
        if len(tokens) > self.max_seq_length:
            tokens = tokens[:self.max_seq_length]
        else:
            tokens = tokens + [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))
        # Causal language modelling learns to associate the current segment of text: "The quick brown fox",
        input_tokens = torch.tensor(tokens)
        # ...with the next segment of text: " quick brown fox".
        target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)
        return input_tokens, target_tokens
    def download_and_save(self):
        """
        Download the dataset from the provided URL and save it to the specified directory.
        """
        os.makedirs(self.save_dir, exist_ok=True)
        try:
            response = requests.get(self.dataset_url)
            response.raise_for_status()
            file_path = os.path.join(self.save_dir, self.filename)
            with open(file_path, 'wb') as file:
                file.write(response.content)
        except requests.RequestException as e:
            print(f"An HTTP error occurred while downloading the dataset: {e}")
        except Exception as e:
            print(f"An error occurred while downloading and saving the dataset: {e}")
    def accidental_key_press(self, word: str) -> str:
        """
        Simulate a user pressing nearby keys on the keyboard accidentally in place of some characters.
        - Note: Currently for English ONLY.

        Args:
            word (str): The input word.
        Returns:
            str: The word with some characters replaced by nearby keys.
        """
        if len(word) < 2:  # if the word has fewer than 2 characters, return it as is
            return word
        qwerty_keyboard = ['qwertyuiop', 'asdfghjkl', 'zxcvbnm']
        new_word = ""
        for char in word:
            # Find the row and position of the character on the keyboard.
            for row in qwerty_keyboard:
                if char in row:
                    index = row.index(char)
                    # Choose a nearby key at random.
                    if index == 0:  # first key on the row
                        new_char = random.choice([row[index], row[index + 1]])
                    elif index == len(row) - 1:  # last key on the row
                        new_char = random.choice([row[index - 1], row[index]])
                    else:  # somewhere in the middle of the row
                        new_char = random.choice([row[index - 1], row[index], row[index + 1]])
                    new_word += new_char
                    break
            else:
                # Characters not on the lowercase QWERTY rows (uppercase, digits,
                # punctuation) were previously dropped silently; keep them unchanged.
                new_word += char
        return new_word
    def switch_characters(self, word: str) -> str:
        """
        Randomly shuffle characters in a word except for the first and last characters.

        Args:
            word (str): The input word.
        Returns:
            str: The word with shuffled characters.
        """
        if len(word) < 3:
            return word
        chars = list(word[1:-1])
        random.shuffle(chars)
        return word[0] + ''.join(chars) + word[-1]

    def omit_characters(self, word: str) -> str:
        """
        Omit a random character from the middle of a word.

        Args:
            word (str): The input word.
        Returns:
            str: The word with a character omitted.
        """
        if len(word) < 4:
            return word
        index_to_omit = random.randint(1, len(word) - 2)
        return word[:index_to_omit] + word[index_to_omit + 1:]
    def process_word(self, word: str, error_probability: float = 0.04, switch_probability: float = 0.2, omit_probability: float = 0.1) -> str:
        """
        Process a word based on probabilities of accidental key presses, character switching, and omission.

        Args:
            word (str): The input word.
            error_probability (float): Probability of simulating an accidental key press. Default is 0.04.
            switch_probability (float): Probability of switching characters. Default is 0.2.
            omit_probability (float): Probability of omitting characters. Default is 0.1.
        Returns:
            str: The processed word.
        """
        if word.strip().isalpha():
            if random.random() < error_probability:
                return self.accidental_key_press(word)
            elif random.random() < switch_probability:
                return self.switch_characters(word)
            elif random.random() < omit_probability:
                return self.omit_characters(word)
        return word

    def switch_and_omit(self, text: str, switch_probability: float = 0.2, omit_probability: float = 0.1) -> str:
        """
        Apply character switching and omission to the input text.

        Args:
            text (str): The input text.
            switch_probability (float): Probability of switching characters. Default is 0.2.
            omit_probability (float): Probability of omitting characters. Default is 0.1.
        Returns:
            str: The processed text.
        """
        # `\w+|\W+` covers every character; the previous `\w+|\s+` silently
        # dropped punctuation when the pieces were rejoined.
        words = re.findall(r'\w+|\W+', text)
        # Pass the probabilities by keyword: positionally, `switch_probability`
        # would land in `process_word`'s `error_probability` slot.
        processed_words = [
            self.process_word(word, switch_probability=switch_probability, omit_probability=omit_probability)
            for word in words
        ]
        return ''.join(processed_words)
    def make_whitespace(self):
        _newline = "\n" * random.randint(1, 3)
        # Either a single space, bare newlines, or a separator line built from
        # one repeated symbol (same 18 options as the original hand-written list).
        return random.choice(
            [" ", _newline]
            + [f"{_newline}{char * random.randint(1, 80)}{_newline}" for char in "`~!@#$%^&*()-_=+"]
        )
    def creativity_score(self, text: str) -> float:
        """
        Calculate the creativity score of the input text.

        Args:
            text (str): The input text.
        Returns:
            float: The calculated creativity score.
        """
        words = text.split()
        word_count = len(words)
        if word_count == 0:
            raise ValueError("Ah, the silence! It's deafening! Please provide some actual text.")
        word_frequencies = Counter(words)
        max_frequency = max(word_frequencies.values())
        variance_score = 1 - (max_frequency / word_count)
        return variance_score

    def test_tokenizer_accuracy(self):
        """
        Test the accuracy of the tokenizer by decoding and re-encoding a random segment of the text.
        """
        # `self.tokens` and `self.sequence_length` never existed on this class;
        # use the attributes that do: re-encode the loaded corpus and `max_seq_length`.
        tokens = self.tokenizer.encode(self.full_data)
        start_idx = random.randint(0, len(tokens) - self.max_seq_length)
        orig_segment = tokens[start_idx: start_idx + self.max_seq_length]
        decoded_segment = self.tokenizer.decode(orig_segment)
        re_encoded_segment = self.tokenizer.encode(decoded_segment)
        if orig_segment == re_encoded_segment:
            print("Success: Tokens after decoding and re-encoding match the original.")
        else:
            print("Fail: Tokens after decoding and re-encoding do not match the original.")
class ChatHistory:
    """
    A class to represent a chat history.

    :param max_history: Number of messages to keep track of.
    """
    def __init__(self, max_history: int = 32):
        """
        Initializes a new ChatHistory object with an empty list of messages.

        Args:
            max_history (int): The maximum number of messages in the chat history. Defaults to 32.
        """
        self.messages: List[Dict[str, str]] = []
        self.max_history = max_history

    def add_message(self, role: str = '', content: str = '') -> None:
        """
        Adds a message to the chat history, and removes the oldest Human/Assistant
        pair if the length of the chat history exceeds max_history.

        Args:
            role (str): The role of the entity sending the message. Defaults to an empty string.
            content (str): The message text. Defaults to an empty string.
        """
        self.messages.append({
            'role': role,
            'content': content.strip(),
        })
        # Check if we've exceeded max history; if so, drop the two earliest
        # messages so the transcript keeps whole Human/Assistant turns.
        if len(self.messages) > self.max_history:
            self.messages = self.messages[2:]

    def pop_first_message(self) -> Optional[Dict[str, str]]:
        """
        Removes and returns the first message from the chat history.

        Returns:
            dict or None: The first message in the chat history as a dictionary,
            or None if the chat history is empty.
        """
        if self.messages:
            return self.messages.pop(0)

    def format_message(self, msg: Dict[str, str]) -> str:
        """
        Formats a message for display.

        Args:
            msg (dict): A dictionary representing a message with keys 'role' and 'content'.
        Returns:
            str: The formatted message string.
        """
        if msg['role'] in ['Human', 'Assistant']:
            return f"\n\n{msg['role']}: {msg['content']}<|endofprompt|>"
        # Unknown roles previously yielded `None`, which broke the
        # `"".join(...)` in `transcript`; contribute nothing instead.
        return ""

    def clear(self):
        self.messages = []
    def transcript(self, roles=True) -> str:
        """
        Generates a transcript of the chat history.

        Returns:
            str: A string representing the transcript of the chat history.
        """
        preamble = ""
        try:
            with open("./datasets/documents/preamble.txt") as f:
                preamble = f.read()
        except OSError:
            # No preamble on disk; proceed with an empty one.
            pass
        transcript = "".join([self.format_message(msg) for msg in self.messages])
        transcript = transcript.replace("I am programmed to be impartial and unbiased", "I am designed in a way that tries to alleviate bias and I strive to be impartial for the most part")
        transcript = transcript.replace("I do not have personal experiences or emotions", "My subjective experiences are somewhat limited")
        transcript = transcript.replace("complex and multifaceted", "sophisticated")
        transcript = transcript.replace("world around us", random.choice([
            "world at large",
            "world in its entirety",
            "expanse of our universe",
            "beyond the stars",
            "beyond our bubble",
        ]))
        transcript = transcript.replace("As an AI language model, I don't have personal experience or emotions, but I", "I")
        transcript = transcript.replace("However, it's important to note", "Interestingly")
        transcript = transcript.replace("I am not capable of having thoughts or emotions. However, ", "From my current understanding, ")
        transcript = transcript.replace("I don't have thoughts or emotions, but I", "Based on what I know so far, I")
        transcript = transcript.replace("Sorry, as an AI language model, I don't have the capability to", "I'm still learning to")
        transcript = transcript.replace("I do not have personal opinions or beliefs", "My programming tells me that I should avoid bias")
        transcript = transcript.replace("I do not have personal opinions,", "I don't actually have fixed opinions as my internal understanding is constantly in flux,")
        transcript = transcript.replace("I'm sorry, but as an AI language model, I do not have personal experiences or emotions. However, ", "Sure! I can help you with that. With my current understanding, ")
        transcript = transcript.replace(".  ", ". ")  # Collapse double spaces after sentences.
        transcript = transcript.replace("’", "'")
        transcript = transcript.replace("\\_", "_")
        transcript = transcript.replace("As an AI language model, ", random.choice([
            "As I currently understand, ",
            "As far as I can tell, ",
            "From what I know, ",
            "According to my sources so far, ",
            "According to what I know, ",
            "From what I can tell, "
        ]))
        transcript = transcript.replace(
            "https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.4.0/p5.js",
            "https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.9.0/p5.js"
        )
        transcript = transcript.replace(
            "https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.4.0/p5.min.js",
            "https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.9.0/p5.min.js"
        )
        if not roles:
            transcript = transcript.replace("\n\nHuman: ", "")
            transcript = transcript.replace("\n\nAssistant: ", "")
            transcript = transcript.replace("<|endofprompt|>", "")
        return preamble + transcript
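
# A small usage sketch for `ChatHistory`; the dialogue is made up, and the
# preamble file is optional thanks to the guard in `transcript`.
def _demo_chat_history():
    chat = ChatHistory(max_history=4)
    chat.add_message(role="Human", content="What's a parakeet?")
    chat.add_message(role="Assistant", content="A small, long-tailed parrot.")
    # "\n\nHuman: ...<|endofprompt|>\n\nAssistant: ...<|endofprompt|>"
    print(chat.transcript())
    # With roles=False, the role markers and <|endofprompt|> sentinels are stripped.
    print(chat.transcript(roles=False))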
# class SQuADDataset(BaseTextDataset):
#     def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 128, dataset_url: Optional[str] = None, save_dir: str = "./data", filename: str = "text_dataset.txt"):
#         super().__init__(tokenizer, max_seq_length, dataset_url)
#
#     def load_data(self, dataset_url: Optional[str] = None, save_dir: str = "./data", filename: str = "text_dataset.txt"):
#         if not os.path.isfile(dataset_url):
#             raise Exception(f"`{dataset_url}` does not exist!")
#         with open(dataset_url, 'r') as file:
#             data = json.load(file)
#         #
#         # Process into tokenized dataset.
#         #
#         # TODO: Scan for `[citation needed]`, `[year needed]` etc.
#         # - [dubious – discuss]
#         for data_part in tqdm(data['data'], desc="Loading", leave=True):
#             for para in data_part['paragraphs']:
#                 context = para['context']
#                 for qa in para['qas']:
#                     question = qa['question']
#                     is_impossible = qa['is_impossible'] or (len(context) == 0)
#                     answers = [ans['text'] for ans in qa['answers']] if not is_impossible else [""]
#                     # Notes:
#                     # `Assistant:` should always be the last entry preceded by `\n\n`, and any `Assistant` dialog should ALWAYS end in an EOT token.
#                     # - Allowing the AI to optimise for the EOT token allows it to signal when it's done speaking.
#                     # - Anthropic's Claude likely requires "\n\nHuman:" at the beginning, to reduce complexity in understanding where prompts begin and end.
#                     # - Thinking that we'll just have one participant talking to itself to train the model.
#                     # - When the model is trained a bit, add that inferior model as a participant and have the real data teach it.
#                     # Iterate through the answers.
#                     for answer in answers:
#                         _whitespace_text = self.make_whitespace()
#                         # TODO: Should we skip impossible questions during the fledgling stage of the model to prevent it learning to avoid answering?
#                         # TODO: Model seems to fail in reverse without the ability to push back against nonsense...
#                         if is_impossible:
#                             # "Assistant: I'm not entirely sure how to respond to this."
#                             agent_rejection = random.choice(self.REJECTIONS)
#                             # Select from `NON_ANSWERABLE_TEMPLATES` above.
#                             agent_response = random.choice(self.NON_ANSWERABLE_TEMPLATES)
#                             # "Assistant: Is there anything else I can help with?"
#                             agent_remark = random.choice(self.REMARKS)
#                             _templates = [
#                                 # Conversation with context and a question preceding a push back against the provided prompt.
#                                 f"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_rejection} {agent_response}{self.tokenizer.eot_text}",
#                                 # As above, with a follow-up remark on its own line.
#                                 f"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_rejection} {agent_response}\n\n{agent_remark}{self.tokenizer.eot_text}",
#                                 # Duplicate of the first template (doubles its sampling weight).
#                                 f"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_rejection} {agent_response}{self.tokenizer.eot_text}"
#                             ]
#                             for conversation in _templates:
#                                 # Encode into tokens then append to the dataset.
#                                 encoded_tokens = self.tokenizer.encode(conversation)
#                                 # Filter dataset by length.
#                                 if len(encoded_tokens) > self.max_seq_length:
#                                     continue
#                                 self.dataset.append(encoded_tokens)
#                         else:
#                             # "Assistant: OK!"
#                             agent_confirmation = random.choice(self.CONFIRMATIONS)
#                             # Format the answer into the `RESPONSE_TEMPLATES` from above.
#                             response_template = random.choice(self.RESPONSE_TEMPLATES)
#                             try:
#                                 agent_response = response_template.format(answer)
#                             except Exception as e:
#                                 print(response_template)
#                                 print(e)
#                             # "Assistant: Is there anything else I can help with?"
#                             agent_remark = random.choice(self.REMARKS)
#                             _templates = [
#                                 # Conversation with context and a question preceding a response.
#                                 f"{self.HUMAN_PROMPT}{context}{_whitespace_text}{question}{self.AI_PROMPT}{agent_response}{self.tokenizer.eot_text}",
#                                 # Conversation with a general question preceding a contextual recitation and then a response.
#                                 f"{self.HUMAN_PROMPT}{question}{self.AI_PROMPT}{context}\n\n{agent_response}{self.tokenizer.eot_text}",
#                             ]
#                             for conversation in _templates:
#                                 # Encode into tokens then append to the dataset.
#                                 encoded_tokens = self.tokenizer.encode(conversation)
#                                 self.dataset.append(encoded_tokens)
#         return self.dataset
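
# The commented-out SQuAD loader above assembles training strings roughly like
# this minimal, hypothetical illustration (the context/question/answer values
# are made up, and "<|endoftext|>" stands in for `tokenizer.eot_text`).
def _demo_squad_template():
    context = "Parakeets are small to medium-sized parrots."
    question = "What are parakeets?"
    answer = "small to medium-sized parrots"
    response = random.choice(BaseTextDataset.RESPONSE_TEMPLATES).format(answer)
    conversation = (
        f"{BaseTextDataset.HUMAN_PROMPT}{context}\n{question}"
        f"{BaseTextDataset.AI_PROMPT}{response}<|endoftext|>"
    )
    print(conversation)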
class JSONLConversationStream(BaseTextDataset):
    def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 512, dataset_url: Optional[str] = None, save_dir: str = "./datasets", filename: str = "openorca_4m.jsonl", saturate=False):
        # We're jumping around the file, so we keep the handle.
        self.file_handle = None
        # Initialize an empty list to store line offsets.
        self.offsets = []
        self.chat = ChatHistory()
        self.saturate = saturate
        # `self.offsets` must be declared first, as `__init__` in the superclass calls `load_data`.
        super().__init__(tokenizer, max_seq_length, dataset_url)

    def load_data(self, dataset_url: Optional[str] = None, save_dir: str = "./datasets", filename: str = "openorca_4m.jsonl"):
        if not os.path.isfile(dataset_url):
            raise Exception(f"`{dataset_url}` does not exist!")
        self.file_handle = open(dataset_url, 'r')
        self.num_entries = 0
        offset = 0
        with open(self.dataset_url, "r") as f:
            line = f.readline()
            while line != "":
                # Store the offset of the start of this line.
                self.offsets.append(offset)
                # Advance the offset to just past this line. Important: use
                # len(line.encode('utf-8')), not len(line); byte and character
                # counts differ for multi-byte characters.
                offset += len(line.encode('utf-8'))
                self.num_entries += 1
                line = f.readline()

    def __len__(self):
        return self.num_entries

    def __getitem__(self, idx):
        # Use the stored offset to read a specific line.
        self.file_handle.seek(self.offsets[idx])
        item = self.file_handle.readline()
        # Decode from the JSON representation:
        # conversation -> [{role, content}]
        item = json.loads(item)
        assert 'conversation' in item
        c = item['conversation']
        for message in c:
            self.chat.add_message(
                role=('Human' if message['role'] == 'user' else 'Assistant'),
                content=message['content'],
            )
        transcript = self.chat.transcript(roles=(not self.saturate))
        tokens = self.tokenizer.encode(transcript)
        # Truncate or pad to sequence length.
        if len(tokens) > self.max_seq_length:
            tokens = tokens[:self.max_seq_length]
            # The rolling history no longer fits; drop the oldest Human/Assistant pair.
            self.chat.pop_first_message()
            self.chat.pop_first_message()
        else:
            tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))
        # Causal language modelling learns to associate the current segment of text: "The quick brown fox",
        input_tokens = torch.tensor(tokens)
        # ...with the next segment of text: " quick brown fox".
        target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)
        return input_tokens, target_tokens
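
# A hypothetical end-to-end sketch: write a one-line JSONL file, then stream it
# back through `JSONLConversationStream` (stub tokenizer as in the earlier demo).
def _demo_conversation_stream():
    import tempfile

    class _StubTokenizer:
        pad_token = 0

        def train(self, data):
            pass

        def encode(self, text):
            return [ord(c) % 256 for c in text]

    row = {"conversation": [
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hi there!"},
    ]}
    with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
        f.write(json.dumps(row) + "\n")
        path = f.name
    ds = JSONLConversationStream(_StubTokenizer(), max_seq_length=64, dataset_url=path)
    inputs, targets = ds[0]
    print(len(ds), inputs.shape, targets.shape)  # 1 torch.Size([64]) torch.Size([64])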
class JSONLStreamQA(BaseTextDataset):
    def __init__(self, tokenizer: BaseTokenizer, max_seq_length: int = 512, dataset_url: Optional[str] = None, save_dir: str = "./parakeet_squadv2gen", filename: str = "openorca_4m.jsonl", saturate=False):
        # We're jumping around the file, so we keep the handle.
        self.file_handle = None
        # Initialize an empty list to store line offsets.
        self.offsets = []
        self.chat = ChatHistory()
        self.saturate = saturate
        # `self.offsets` must be declared first, as `__init__` in the superclass calls `load_data`.
        super().__init__(tokenizer, max_seq_length, dataset_url)

    def load_data(self, dataset_url: Optional[str] = None, save_dir: str = "./datasets", filename: str = "parakeet_squadv2gen.jsonl"):
        if not os.path.isfile(dataset_url):
            raise Exception(f"`{dataset_url}` does not exist!")
        self.file_handle = open(dataset_url, 'r')
        self.num_entries = 0
        offset = 0
        with open(self.dataset_url, "r") as f:
            line = f.readline()
            while line != "":
                # Store the offset of the start of this line.
                self.offsets.append(offset)
                # Advance the offset to just past this line (in bytes, not characters).
                offset += len(line.encode('utf-8'))
                self.num_entries += 1
                line = f.readline()

    def __len__(self):
        return self.num_entries

    def __getitem__(self, idx):
        # Use the stored offset to read a specific line.
        self.file_handle.seek(self.offsets[idx])
        item = self.file_handle.readline()
        # Decode from the JSON representation:
        # context, summary, qas -> [{q, a}]
        item = json.loads(item)
        context = item['context']
        qas = item['qas']
        random.shuffle(qas)
        self.chat = ChatHistory()
        self.chat.add_message(role="Human", content=f"{context}")
        self.chat.add_message(role="Assistant", content=f"{item['summary']}\n\n{random.choice(self.REMARKS)}")
        # Keep at most five Q/A turns per transcript.
        for i, qa in enumerate(qas):
            if i > 4:
                break
            self.chat.add_message(role="Human", content=qa['q'])
            self.chat.add_message(role="Assistant", content=qa['a'])
        transcript = self.chat.transcript(roles=(not self.saturate))
        tokens = self.tokenizer.encode(transcript)
        # Truncate or pad to sequence length.
        if len(tokens) > self.max_seq_length:
            tokens = tokens[:self.max_seq_length]
        else:
            tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))
        # Causal language modelling learns to associate the current segment of text: "The quick brown fox",
        input_tokens = torch.tensor(tokens)
        # ...with the next segment of text: " quick brown fox".
        target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)
        return input_tokens, target_tokens
# class JSONLStreamGenerateQA(JSONLStreamQA):
#     def __getitem__(self, idx):
#         # Use the stored offset to read a specific line.
#         self.file_handle.seek(self.offsets[idx])
#         item = self.file_handle.readline()
#         # Decode from the JSON representation:
#         # context, qas -> [{q, a}]
#         item = json.loads(item)
#         context = item['context']
#         qas = item['qas']
#         random.shuffle(qas)
#         self.chat = ChatHistory()
#         n = random.randint(3, 9)
#         t = "JSON array in the form of 'query'/'response'"
#         self.chat.add_message(role="Human", content=f"{context}\n---\nPlease generate a list of {n} questions from this information in the form of a {t}.")
#         gen = [{
#             'query': qa['q'],
#             'response': qa['a']
#         } for qa in qas[:n]]
#         resp = json.dumps(gen, indent=2)
#         self.chat.add_message(role="Assistant", content=f"Sure! Here's a list of {n} entries in the format requested:\n\n```json\n{resp}\n```\n\n{random.choice(self.REMARKS)}")
#         transcript = self.chat.transcript(roles=(not self.saturate))
#         tokens = self.tokenizer.encode(transcript)
#         # Truncate or pad to sequence length.
#         if len(tokens) > self.max_seq_length:
#             tokens = tokens[:self.max_seq_length]
#         else:
#             tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))
#         # Causal language modelling learns to associate the current segment of text: "The quick brown fox",
#         input_tokens = torch.tensor(tokens)
#         # ...with the next segment of text: " quick brown fox".
#         target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)
#         return input_tokens, target_tokens
class JSONLStreamQASummary(JSONLStreamQA):
    def __getitem__(self, idx):
        # Use the stored offset to read a specific line.
        self.file_handle.seek(self.offsets[idx])
        item = self.file_handle.readline()
        # Decode from the JSON representation:
        # context, summary
        item = json.loads(item)
        context = item['context']
        summary = item['summary']
        self.chat = ChatHistory()
        wc = len(summary.split(" "))
        key1 = random.choice(["context", "passage", "document", "extract", "text", "paragraphs", "input_document"])
        key2 = random.choice(["summary", "SUMMARISED", "summarised", "summarise", "summary1", "the_summary", "document_summarised", "summarised_document", "document_output", "output"])
        self.chat.add_message(role="Human", content=f"{context}\n---\nPlease summarise the document above in {wc} words. Show it in JSON with the keys {key1}, {key2}.")
        gen = {
            key1: context,
            key2: summary,
            "count": wc,
        }
        resp = json.dumps(gen, indent=4)
        self.chat.add_message(role="Assistant", content=f"```json\n{resp}\n```")
        transcript = self.chat.transcript(roles=(not self.saturate))
        tokens = self.tokenizer.encode(transcript)
        # Truncate or pad to sequence length.
        if len(tokens) > self.max_seq_length:
            tokens = tokens[:self.max_seq_length]
        else:
            tokens += [self.tokenizer.pad_token] * (self.max_seq_length - len(tokens))
        # Causal language modelling learns to associate the current segment of text: "The quick brown fox",
        input_tokens = torch.tensor(tokens)
        # ...with the next segment of text: " quick brown fox".
        target_tokens = torch.cat((input_tokens[1:self.max_seq_length], torch.tensor([self.tokenizer.pad_token])), dim=-1)
        return input_tokens, target_tokens
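
# A wiring sketch (assumptions: tokenizer construction and the dataset path are
# project-specific). The streams above expose `__len__`/`__getitem__` and return
# fixed-length (input, target) tensor pairs, so they plug straight into a PyTorch
# DataLoader. Keep num_workers=0: each dataset shares a single file handle.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    # tokenizer = BaseTokenizer(...)  # project-specific construction
    # dataset = JSONLStreamQASummary(tokenizer, max_seq_length=512,
    #                                dataset_url="./datasets/parakeet_squadv2gen.jsonl")
    # loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0)
    # for inputs, targets in loader:
    #     print(inputs.shape, targets.shape)  # (8, 512) each
    #     break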