rk3588 rag 2025

Jan 5th, 2025
The /tmp folder on the rk3588 seems to be too small - the models need to be downloaded to a local folder to avoid running out of space in /tmp.

Create a "models" folder and run inside it:
git clone https://huggingface.co/BAAI/bge-m3
git clone https://huggingface.co/BAAI/bge-reranker-large
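
Note: Hugging Face repos keep the actual model weights in Git LFS, so "git lfs install" may be needed once before cloning - otherwise the clones contain only small pointer files.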

Then run the vector.py file below to create the index (some pip packages will be missing; chatgpt will help). The important setting is "Settings.chunk_size = 512", which limits the size of each chunk. If we have to work with a 4096-token limit, 7 chunks of 512 come to 3584 tokens, which stays within it and leaves room for the question itself. A reranker might be necessary.

[vector.py]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Limit each chunk to 512 tokens
Settings.chunk_size = 512

# My local documents
documents = SimpleDirectoryReader("data", recursive=True).load_data()

# Embeddings model
Settings.embed_model = HuggingFaceEmbedding(model_name="./models/bge-m3")

# Create index and persist it to ./vector
index = VectorStoreIndex.from_documents(documents)
index.storage_context.persist(persist_dir="./vector")
[/vector.py]
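
To sanity-check the persisted index before wiring it into the chat app, a quick retrieval test can be run. This is a minimal sketch - the file name check_index.py and the test question are placeholders:

[check_index.py]:
from llama_index.core import StorageContext, load_index_from_storage, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Must match the embedding model used to build the index
Settings.embed_model = HuggingFaceEmbedding(model_name="./models/bge-m3")
Settings.llm = None

# Load the persisted index and retrieve a few chunks
index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./vector"))
nodes = index.as_retriever(similarity_top_k=3).retrieve("test question")
for n in nodes:
    print(n.score, n.node.get_text()[:80])
[/check_index.py]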

Then modify rkllm-gradio with llama-index and the reranker. Two settings are important. First, the number of candidate chunks retrieved by bge-m3 for the context (similarity_top_k). This is not bound by the 4096 limit, so it can be larger, for example 20 chunks of size 512:
query_engine = index.as_query_engine(similarity_top_k=20, node_postprocessors=[rerank])

Second, the number of chunks the reranker keeps and passes to the Rockchip NPU (top_n). This has to be 7 (with a chunk size of 512) to meet the 4096 limit:
rerank = FlagEmbeddingReranker(model="./models/bge-reranker-large", top_n=7)

[rkllm-gradio.py]
import os
import sys
import resource
import gradio as gr
from ctypes_bindings import *
from model_class import *
from mesh_utils import *
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker,
)

# Reranker keeps the 7 best chunks for the NPU context window
rerank = FlagEmbeddingReranker(model="./models/bge-reranker-large", top_n=7)

# Set environment variables
os.environ["OPENAI_API_KEY"] = "fake"
os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
os.environ["GRADIO_SERVER_PORT"] = "8080"
os.environ["RKLLM_LOG_LEVEL"] = "1"

# Embedding model; no LLM in llama-index - it is only used to assemble context
Settings.embed_model = HuggingFaceEmbedding(model_name="./models/bge-m3")
Settings.llm = None

# Set resource limit
resource.setrlimit(resource.RLIMIT_NOFILE, (102400, 102400))

history = []

if __name__ == "__main__":
    # Helper function to define initializing model before class is declared
    # Without this, you would need to initialize the class before you select the model
    def initialize_model(model):
        global rkllm_model
        # Have to unload previous model in single-threaded mode
        try:
            rkllm_model.release()
        except NameError:
            print("No model loaded! Continuing with initialization...")
        # Initialize RKLLM model
        init_msg = "=========INITIALIZING==========="
        print(init_msg)
        sys.stdout.flush()
        rkllm_model = RKLLMLoaderClass(model=model)
        model_init = f"RKLLM Model, {rkllm_model.model_name} has been initialized successfully!"
        print(model_init)
        complete_init = "=============================="
        print(complete_init)
        output = [[f"<h4 style=\"text-align:center;\">{model_init}\n</h4>", None]]
        sys.stdout.flush()
        return output

    # Helper function to stream LLM output into the chat box
    def get_RKLLM_output(message, history):
        # Ensure `message` is a dictionary with a `text` key
        if isinstance(message, str):
            message = {"text": message}
        elif not isinstance(message, dict) or "text" not in message:
            raise ValueError("Invalid message format. Expected a dictionary with a 'text' key.")

        # Create storage context from persisted data
        storage_context = StorageContext.from_defaults(persist_dir="./vector")

        # Load index from storage context
        index = load_index_from_storage(storage_context)

        # Query the index for context
        query_engine = index.as_query_engine(similarity_top_k=20, node_postprocessors=[rerank])
        context = str(query_engine.query(message["text"]))
        print(message)
        print(context)
        message_with_context = f"{context}\n{message['text']}"
        try:
            yield from rkllm_model.get_RKLLM_output(message_with_context, history)
        except RuntimeError as e:
            print(f"ERROR: {e}")
            return history

    # Create a Gradio interface
    with gr.Blocks(title="Chat with RKLLM") as chatRKLLM:
        available_models = available_models()
        gr.Markdown("<div align='center'><font size='10'> Rockchip NPU </font></div>")
        with gr.Tabs():
            with gr.TabItem("Select Model"):
                model_dropdown = gr.Dropdown(choices=available_models, label="Select Model", value="None", allow_custom_value=True)
                statusBox = gr.Chatbot(height=100)
                model_dropdown.input(initialize_model, [model_dropdown], [statusBox])
            with gr.TabItem("Txt2Txt"):
                txt2txt = gr.ChatInterface(fn=get_RKLLM_output, type="messages")
            with gr.TabItem("Txt2Mesh"):
                with gr.Row():
                    with gr.Column(scale=2):
                        txt2txt = gr.ChatInterface(fn=get_RKLLM_output, type="messages")
                    with gr.Column(scale=2):
                        # Add the text box for 3D mesh input and button
                        mesh_input = gr.Textbox(
                            label="3D Mesh Input",
                            placeholder="Paste your 3D mesh in OBJ format here...",
                            lines=5,
                        )
                        visualize_button = gr.Button("Visualize 3D Mesh")
                        output_model = gr.Model3D(
                            label="3D Mesh Visualization",
                            interactive=False,
                        )
                        # Link the button to the visualization function
                        visualize_button.click(
                            fn=apply_gradient_color,
                            inputs=[mesh_input],
                            outputs=[output_model]
                        )
        print("\nNo model loaded yet!\n")

    # Enable the event queue system.
    chatRKLLM.queue()
    # Start the Gradio application.
    chatRKLLM.launch()

    print("====================")
    print("RKLLM model inference completed, releasing RKLLM model resources...")
    rkllm_model.release()
    print("====================")
[/rkllm-gradio.py]
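
With both files in place, run "python3 vector.py" once to build the index, then "python3 rkllm-gradio.py" to start the app. The environment variables above bind Gradio to 0.0.0.0 on port 8080, so the UI should be reachable from the local network at http://<board-ip>:8080.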