# Build your own Wikipedia RAG with LM Studio

This guide shows how to turn a Wikipedia dump into a working RAG pipeline:

- Wikipedia dump → plain text
- Chunk into passages
- Embed with sentence-transformers
- Store in a Chroma vector DB
- Query with retrieval + local generation (LM Studio)

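By the end, the working directory will look roughly like this (paths as used in the steps below):

```
wiki-data/
├── enwiki-latest-pages-articles.xml.bz2   # raw dump
├── extracted/                             # plain text from wikiextractor
├── wiki_chunks.jsonl                      # one passage per line
└── chroma_db/                             # persisted vector store
```
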
---

## 1. Set up the environment

```bash
python -m venv venv
source venv/bin/activate   # on Windows: venv\Scripts\activate
pip install --upgrade pip

# faiss-cpu is optional; this guide uses Chroma as the vector store
pip install wikiextractor sentence-transformers chromadb faiss-cpu nltk tqdm requests
```

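A quick sanity check that the key libraries import cleanly (run inside the activated venv):

```bash
python -c "import nltk, chromadb, sentence_transformers; print('imports ok')"
```
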
---

## 2. Download the Wikipedia dump

```bash
mkdir wiki-data && cd wiki-data
wget -c https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
```

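The full English dump is on the order of 20 GB compressed and takes hours to process end to end. For a first dry run, the much smaller Simple English dump works with the exact same pipeline; just substitute its filename in the commands that follow:

```bash
# optional: a ~300 MB dump for a quick end-to-end test
wget -c https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles.xml.bz2
```
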
---

## 3. Extract plain text

```bash
python -m wikiextractor.WikiExtractor -o extracted enwiki-latest-pages-articles.xml.bz2
```

This writes plain text into `extracted/`, as subdirectories (`AA/`, `AB/`, ...) of files named `wiki_00`, `wiki_01`, and so on. Each file holds many articles wrapped in `<doc title="...">...</doc>` blocks.

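A quick look confirms the layout; the `AA/wiki_00` naming is the wikiextractor default:

```bash
ls extracted/                    # AA/ AB/ ...
head -c 400 extracted/AA/wiki_00
```
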
---

## 4. Chunk articles into passages

Save as `wiki_to_chunks.py`:

```python
import re, json
from pathlib import Path
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')  # newer NLTK releases need this for sent_tokenize
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

EXTRACTED_DIR = "extracted"
OUTFILE = "wiki_chunks.jsonl"
MAX_CHARS = 1200  # target passage size in characters

def iter_doc_texts(extracted_dir):
    """Yield (title, body) for every <doc> block in the wikiextractor output."""
    p = Path(extracted_dir)
    # wikiextractor names its output files wiki_00, wiki_01, ... (no .txt extension)
    for file in p.rglob("wiki_*"):
        s = file.read_text(encoding="utf-8", errors="ignore")
        for m in re.finditer(r'(<doc[^>]*>)(.*?)</doc>', s, flags=re.S):
            open_tag = m.group(1)
            body = m.group(2).strip()
            tmatch = re.search(r'title="([^"]+)"', open_tag)
            title = tmatch.group(1) if tmatch else "NO_TITLE"
            yield title, body

def chunk_text(text, max_chars=MAX_CHARS):
    """Greedily pack whole sentences into chunks of roughly max_chars."""
    sents = sent_tokenize(text)
    chunks, cur, cur_len = [], [], 0
    for sent in sents:
        cur.append(sent)
        cur_len += len(sent)
        if cur_len >= max_chars:
            chunks.append(" ".join(cur))
            cur, cur_len = [], 0
    if cur:
        chunks.append(" ".join(cur))
    return chunks

def main():
    idx = 0
    with open(OUTFILE, "w", encoding="utf-8") as outfh:
        for title, body in tqdm(iter_doc_texts(EXTRACTED_DIR)):
            if len(body) < 200:  # skip stubs and redirect remnants
                continue
            for i, c in enumerate(chunk_text(body)):
                rec = {"id": f"doc{idx:09d}-{i}", "title": title, "text": c}
                outfh.write(json.dumps(rec, ensure_ascii=False) + "\n")
            idx += 1
    print("Wrote", OUTFILE)

if __name__ == "__main__":
    main()
```

Run:

```bash
python wiki_to_chunks.py
```

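Each output line is one self-contained passage. A quick peek verifies the format (the title and text shown here are placeholders; yours will come from the dump):

```bash
head -n 1 wiki_chunks.jsonl
# {"id": "doc000000000-0", "title": "...", "text": "..."}
```
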
---

## 5. Ingest into Chroma with embeddings

Save as `ingest_chroma.py`:

```python
import json
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import chromadb

MODEL_NAME = "all-MiniLM-L6-v2"
CHUNKS_FILE = "wiki_chunks.jsonl"
BATCH = 512
PERSIST_DIR = "./chroma_db"

print("Loading embedding model...")
model = SentenceTransformer(MODEL_NAME)

print("Setting up Chroma...")
# chromadb >= 0.4: PersistentClient replaces the old
# Settings(chroma_db_impl="duckdb+parquet", ...) API and persists automatically
client = chromadb.PersistentClient(path=PERSIST_DIR)
collection = client.get_or_create_collection("wikipedia")

docs, metas, ids = [], [], []
with open(CHUNKS_FILE, "r", encoding="utf-8") as fh:
    for line in fh:
        j = json.loads(line)
        ids.append(j["id"])
        docs.append(j["text"])
        metas.append({"title": j["title"]})

print("Total docs:", len(ids))

for i in tqdm(range(0, len(docs), BATCH)):
    batch_docs = docs[i:i+BATCH]
    batch_ids = ids[i:i+BATCH]
    batch_metas = metas[i:i+BATCH]
    emb = model.encode(batch_docs, convert_to_numpy=True).tolist()
    collection.add(ids=batch_ids, documents=batch_docs, metadatas=batch_metas, embeddings=emb)

print("Ingest complete.")  # no explicit persist() needed with PersistentClient
```

Run:

```bash
python ingest_chroma.py
```

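Before wiring up generation, it is worth a quick smoke test that the collection is populated and retrieval returns plausible passages. A minimal sketch, using the same paths and collection name as above:

```python
import chromadb
from sentence_transformers import SentenceTransformer

client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_collection("wikipedia")
print("chunks stored:", collection.count())

# embed a test query and fetch the 3 nearest passages
model = SentenceTransformer("all-MiniLM-L6-v2")
q_emb = model.encode(["Who was Ada Lovelace?"]).tolist()
res = collection.query(query_embeddings=q_emb, n_results=3)
for meta in res["metadatas"][0]:
    print("-", meta["title"])
```
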
---

## 6. Query with RAG + LM Studio

LM Studio exposes an OpenAI-compatible API at http://localhost:1234/v1/chat/completions.
Start LM Studio, load a model (e.g. Mistral or Llama), and enable the local API server.

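Before running the script, you can confirm the server is reachable; `/v1/models` is part of the OpenAI-compatible API LM Studio serves:

```bash
curl http://localhost:1234/v1/models
```
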
Save as `query_rag.py`:

```python
import requests
from sentence_transformers import SentenceTransformer
import chromadb

LMSTUDIO_URL = "http://localhost:1234/v1/chat/completions"

EMBED_MODEL = "all-MiniLM-L6-v2"
CHROMA_DIR = "./chroma_db"
TOP_K = 4

emb_model = SentenceTransformer(EMBED_MODEL)
client = chromadb.PersistentClient(path=CHROMA_DIR)  # chromadb >= 0.4 API
collection = client.get_collection("wikipedia")

def retrieve(query, top_k=TOP_K):
    """Embed the query and fetch the top_k most similar passages."""
    q_emb = emb_model.encode([query])[0].tolist()
    res = collection.query(query_embeddings=[q_emb], n_results=top_k)
    docs = res["documents"][0]
    metas = res["metadatas"][0]
    return list(zip(docs, metas))

def build_prompt(query, docs):
    """Pack the retrieved passages into a grounded-answer prompt."""
    context = "\n\n---\n\n".join(f"Title: {m.get('title','')}\n\n{d}" for d, m in docs)
    return f"""Use the context below (Wikipedia) to answer.
If the answer is not in the context, say you don't know.

CONTEXT:
{context}

QUESTION: {query}
"""

def generate_lmstudio(prompt):
    payload = {
        "model": "local-model",  # LM Studio ignores the name and uses the loaded model
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
        "max_tokens": 400,
    }
    r = requests.post(LMSTUDIO_URL, json=payload, timeout=120)
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"]

def answer(query):
    docs = retrieve(query)
    return generate_lmstudio(build_prompt(query, docs))

if __name__ == "__main__":
    q = input("Question: ")
    print(answer(q))
```

Run:

```bash
python query_rag.py
# Example question: Who was Ada Lovelace?
```

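If you want answers to cite where they came from, a small variant of `answer()` appends the retrieved article titles (a sketch reusing `retrieve`, `build_prompt`, and `generate_lmstudio` from above):

```python
def answer_with_sources(query):
    docs = retrieve(query)
    reply = generate_lmstudio(build_prompt(query, docs))
    # collect the distinct Wikipedia titles that were retrieved
    titles = sorted({m.get("title", "") for _, m in docs})
    return reply + "\n\nSources: " + "; ".join(titles)
```
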
---

## 7. Quick checklist

1) wget -c https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
2) python -m wikiextractor.WikiExtractor -o extracted enwiki-latest-pages-articles.xml.bz2
3) python wiki_to_chunks.py
4) python ingest_chroma.py
5) Start LM Studio (load a model, enable the API server)
6) python query_rag.py

---

That’s it 🚀
You now have a Wikipedia → RAG pipeline running locally with LM Studio.