sentencepiece_tokenizer_trainer

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Filename: sentencepiece_tokenizer_trainer.py
# Version: 1.0.0
# Author: Jeoi Reqi

"""
This script trains a SentencePiece tokenizer model on provided or user-selected text data.
It offers the option to use the sample text data included in the script or to select a custom text file.
The trained model is saved with the prefix 'tokenizer', and vocabulary files are generated accordingly.

Requirements:

    - Python 3.x
    - sentencepiece library
    - tkinter

Functions:

    - use_sample_text(): Writes the built-in sample text data to a file and returns its path.
    - use_custom_text(): Lets the user choose a custom text file via a file dialog.
    - main(): Offers the sample/custom choice, trains the model, and generates vocabulary files.

Usage:

    1. Run the script with a Python 3.x interpreter.
    2. Choose whether to use the sample text data or to select a custom text file.
    3. The script trains a SentencePiece tokenizer model on the chosen text data.
    4. The tokenizer model and vocabulary files are generated with the prefix 'tokenizer'.

Additional Notes:

    - The SentencePiece model is trained with adjusted parameters such as vocabulary size and character coverage.
    - Diverse text sources are recommended for training so the model captures a wide range of linguistic nuances.
    - Ensure the sentencepiece library is installed in the Python environment before running the script.
    - This script provides a convenient way to train a tokenizer model for text preprocessing tasks.
"""
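
# Environment note (an assumption added here, not from the original script): the
# dependency installs with `pip install sentencepiece`; tkinter ships with most
# CPython builds but may need an OS package (e.g. python3-tk) on some Linux distros.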

import os
import sentencepiece as spm
from tkinter import Tk, filedialog

def use_sample_text():
    # Sample text data
    text_data = """
    Welcome to the sample text corpus! This corpus is designed to showcase the capabilities of the tokenizer model. It consists of several sentences carefully crafted to cover a wide range of linguistic patterns and vocabulary.
    In this corpus, you'll find sentences of varying lengths and complexities. Some sentences are short and simple, while others are longer and more intricate. This diversity helps the tokenizer model learn to handle different types of text inputs effectively.
    The purpose of this sample text is to provide a starting point for training the tokenizer model. However, it's essential to note that real-world text data will vary significantly from this example. Therefore, it's highly recommended to replace this sample text with your own data for more accurate and relevant model training.
    When replacing this sample text with your own data, consider using a diverse set of text sources. Include text from different domains, genres, and languages to ensure that the tokenizer model captures a wide range of linguistic nuances.
    Remember, the quality of the tokenizer model depends largely on the quality and diversity of the training data. So, take your time to gather and curate a comprehensive dataset that reflects the text inputs your model will encounter in real-world applications.
    Thank you for using this sample text corpus. We wish you the best of luck in training your tokenizer model!
    """
    with open("text_data.txt", "w", encoding="utf-8") as f:
        f.write(text_data)

    return "text_data.txt"

# Open a file-explorer dialog so the user can select a custom text file to use
def use_custom_text():
    root = Tk()
    root.withdraw()  # Hide the empty root window; only the dialog is shown
    file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
    root.destroy()  # Release the hidden Tk root once the dialog closes
    return file_path
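
# Hedged alternative (hypothetical helper, not called by main()): tkinter's file
# dialog needs a graphical display, so on a headless machine a plain prompt is a
# workable substitute. Returning "" on a bad path reuses main()'s existing
# "no text file selected" exit.
def use_custom_text_headless():
    path = input("Path to a .txt training file: ").strip()
    return path if os.path.isfile(path) else ""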

def main():
    print("Menu Options:\n")
    print("1. Use sample text data")
    print("2. Choose a custom text file")
    choice = input("\nEnter your choice (1 or 2): ")

    if choice == "1":
        print("\n\t\tTraining with sample data...\n")
        text_file_path = use_sample_text()
    elif choice == "2":
        print("\n\t\tTraining with custom data...\n")
        text_file_path = use_custom_text()
    else:
        print("\n\t\tInvalid choice. Please enter '1' or '2'.\n")
        return

    if not text_file_path:
        print("\n\t\tNo text file selected. Exiting...\n")
        return

    # Train SentencePiece model with adjusted parameters
    spm.SentencePieceTrainer.train(
        input=text_file_path,      # Use the selected text data file as input
        model_prefix="tokenizer",  # Set the prefix for model and vocabulary files
        vocab_size=189,            # Edit the vocabulary size to accommodate all required characters
        character_coverage=0.9995, # Keep the character coverage as specified
        num_threads=16,            # Use 16 threads for training
    )
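
    # Aside (illustrative, commented out): SentencePieceTrainer.train also accepts
    # a model_type argument; the library defaults to the unigram algorithm, with
    # "bpe", "char", and "word" as alternatives. The same call with BPE would be:
    # spm.SentencePieceTrainer.train(
    #     input=text_file_path,
    #     model_prefix="tokenizer",
    #     vocab_size=189,
    #     character_coverage=0.9995,
    #     model_type="bpe",  # byte-pair encoding instead of the unigram default
    # )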

    # Load the trained model
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load("tokenizer.model")

    # Print the vocabulary size
    print(
        f"\n\t\tTRAINING COMPLETE!\n\t\t--------------------------------------\n\t\tVocabulary size: {tokenizer.vocab_size()}"
    )

    # Generate vocabulary file, one "id piece" entry per line
    with open("vocab.txt", "w", encoding="utf-8") as vocab_file:
        for vocab_id in range(tokenizer.vocab_size()):
            vocab_file.write(f"{vocab_id:>30} {tokenizer.id_to_piece(vocab_id)}\n")

    print("\t\t--------------------------------------\n\t\tFiles Created:\n")
    print(f"\t\t- tokenizer.model\n\t\tPath: {os.path.abspath('tokenizer.model')}\n")
    print(f"\t\t- tokenizer.vocab\n\t\tPath: {os.path.abspath('tokenizer.vocab')}\n")
    print(
        f"\t\t- vocab.txt\n\t\tPath: {os.path.abspath('vocab.txt')}\n\t\t--------------------------------------"
    )
    print(
        "\t\tExiting program...\tGoodBye!\n\t\t--------------------------------------"
    )

if __name__ == "__main__":
    main()
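
# Usage sketch (illustrative, not called anywhere above; assumes a prior run has
# produced tokenizer.model in the working directory). It round-trips one sentence
# through the trained model; the function name and sample sentence are additions.
def demo_roundtrip(sample="SentencePiece segments raw text into subword pieces."):
    sp = spm.SentencePieceProcessor()
    sp.load("tokenizer.model")
    pieces = sp.encode_as_pieces(sample)  # text -> subword strings
    ids = sp.encode_as_ids(sample)        # text -> token ids
    print(pieces)
    print(ids)
    print(sp.decode_ids(ids))             # ids -> text round-trip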