rk3588 rag 2025

Jan 5th, 2025
The /tmp folder on the rk3588 seems to be too small - the models need to be downloaded to a local folder to avoid running out of space in /tmp.

Create a "models" folder and run inside it:
git clone https://huggingface.co/BAAI/bge-m3
git clone https://huggingface.co/BAAI/bge-reranker-large
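
Note: Hugging Face repos keep the actual model weights in Git LFS, so "git lfs install" may be needed once before cloning - otherwise the clones contain only small pointer files.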

Then run the vector.py file below to create the index (some pip packages will be missing; chatgpt will help). The important setting is "Settings.chunk_size = 512", which limits the size of each chunk. If we have to work with a 4096-token limit, 7 chunks of 512 come to 3584 tokens, which stays within it and leaves room for the question itself. A reranker might be necessary.

[vector.py]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Limit each chunk to 512 tokens
Settings.chunk_size = 512

# My local documents
documents = SimpleDirectoryReader("data", recursive=True).load_data()

# Embeddings model
Settings.embed_model = HuggingFaceEmbedding(model_name="./models/bge-m3")

# Create index and persist it to ./vector
index = VectorStoreIndex.from_documents(documents)
index.storage_context.persist(persist_dir="./vector")
[/vector.py]
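
To sanity-check the persisted index before wiring it into the chat app, a quick retrieval test can be run. This is a minimal sketch - the file name check_index.py and the test question are placeholders:

[check_index.py]:
from llama_index.core import StorageContext, load_index_from_storage, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Must match the embedding model used to build the index
Settings.embed_model = HuggingFaceEmbedding(model_name="./models/bge-m3")
Settings.llm = None

# Load the persisted index and retrieve a few chunks
index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./vector"))
nodes = index.as_retriever(similarity_top_k=3).retrieve("test question")
for n in nodes:
    print(n.score, n.node.get_text()[:80])
[/check_index.py]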

Then modify rkllm-gradio with llama-index and the reranker. Two settings are important. First, the number of candidate chunks retrieved by bge-m3 for the context (similarity_top_k). This is not bound by the 4096 limit, so it can be larger, for example 20 chunks of size 512:
query_engine = index.as_query_engine(similarity_top_k=20, node_postprocessors=[rerank])

Second, the number of chunks the reranker keeps and passes to the Rockchip NPU (top_n). This has to be 7 (with a chunk size of 512) to meet the 4096 limit:
rerank = FlagEmbeddingReranker(model="./models/bge-reranker-large", top_n=7)

[rkllm-gradio.py]
import os
import sys
import resource
import gradio as gr
from ctypes_bindings import *
from model_class import *
from mesh_utils import *
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker,
)

# Reranker keeps the 7 best chunks for the NPU context window
rerank = FlagEmbeddingReranker(model="./models/bge-reranker-large", top_n=7)

# Set environment variables
os.environ["OPENAI_API_KEY"] = "fake"
os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
os.environ["GRADIO_SERVER_PORT"] = "8080"
os.environ["RKLLM_LOG_LEVEL"] = "1"

# Embedding model; no LLM in llama-index - it is only used to assemble context
Settings.embed_model = HuggingFaceEmbedding(model_name="./models/bge-m3")
Settings.llm = None

# Set resource limit
resource.setrlimit(resource.RLIMIT_NOFILE, (102400, 102400))

history = []

if __name__ == "__main__":
    # Helper function to define initializing model before class is declared
    # Without this, you would need to initialize the class before you select the model
    def initialize_model(model):
        global rkllm_model
        # Have to unload previous model in single-threaded mode
        try:
            rkllm_model.release()
        except NameError:
            print("No model loaded! Continuing with initialization...")
        # Initialize RKLLM model
        init_msg = "=========INITIALIZING==========="
        print(init_msg)
        sys.stdout.flush()
        rkllm_model = RKLLMLoaderClass(model=model)
        model_init = f"RKLLM Model, {rkllm_model.model_name} has been initialized successfully!"
        print(model_init)
        complete_init = "=============================="
        print(complete_init)
        output = [[f"<h4 style=\"text-align:center;\">{model_init}\n</h4>", None]]
        sys.stdout.flush()
        return output

    # Helper function to stream LLM output into the chat box
    def get_RKLLM_output(message, history):
        # Ensure `message` is a dictionary with a `text` key
        if isinstance(message, str):
            message = {"text": message}
        elif not isinstance(message, dict) or "text" not in message:
            raise ValueError("Invalid message format. Expected a dictionary with a 'text' key.")

        # Create storage context from persisted data
        storage_context = StorageContext.from_defaults(persist_dir="./vector")

        # Load index from storage context
        index = load_index_from_storage(storage_context)

        # Query the index for context
        query_engine = index.as_query_engine(similarity_top_k=20, node_postprocessors=[rerank])
        context = str(query_engine.query(message["text"]))
        print(message)
        print(context)
        message_with_context = f"{context}\n{message['text']}"
        try:
            yield from rkllm_model.get_RKLLM_output(message_with_context, history)
        except RuntimeError as e:
            print(f"ERROR: {e}")
            return history

    # Create a Gradio interface
    with gr.Blocks(title="Chat with RKLLM") as chatRKLLM:
        available_models = available_models()
        gr.Markdown("<div align='center'><font size='10'> Rockchip NPU </font></div>")
        with gr.Tabs():
            with gr.TabItem("Select Model"):
                model_dropdown = gr.Dropdown(choices=available_models, label="Select Model", value="None", allow_custom_value=True)
                statusBox = gr.Chatbot(height=100)
                model_dropdown.input(initialize_model, [model_dropdown], [statusBox])
            with gr.TabItem("Txt2Txt"):
                txt2txt = gr.ChatInterface(fn=get_RKLLM_output, type="messages")
            with gr.TabItem("Txt2Mesh"):
                with gr.Row():
                    with gr.Column(scale=2):
                        txt2txt = gr.ChatInterface(fn=get_RKLLM_output, type="messages")
                    with gr.Column(scale=2):
                        # Add the text box for 3D mesh input and button
                        mesh_input = gr.Textbox(
                            label="3D Mesh Input",
                            placeholder="Paste your 3D mesh in OBJ format here...",
                            lines=5,
                        )
                        visualize_button = gr.Button("Visualize 3D Mesh")
                        output_model = gr.Model3D(
                            label="3D Mesh Visualization",
                            interactive=False,
                        )
                        # Link the button to the visualization function
                        visualize_button.click(
                            fn=apply_gradient_color,
                            inputs=[mesh_input],
                            outputs=[output_model]
                        )
        print("\nNo model loaded yet!\n")

    # Enable the event queue system.
    chatRKLLM.queue()
    # Start the Gradio application.
    chatRKLLM.launch()

    print("====================")
    print("RKLLM model inference completed, releasing RKLLM model resources...")
    rkllm_model.release()
    print("====================")
[/rkllm-gradio.py]
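
With both files in place, run "python3 vector.py" once to build the index, then "python3 rkllm-gradio.py" to start the app. The environment variables above bind Gradio to 0.0.0.0 on port 8080, so the UI should be reachable from the local network at http://<board-ip>:8080.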