# Build your own Wikipedia RAG with LM Studio

This guide shows how to turn a Wikipedia dump into a working RAG pipeline:

- Wikipedia dump → plain text
- Chunk into passages
- Embed with sentence-transformers
- Store in a Chroma vector DB
- Query with retrieval + local generation (LM Studio)

---
## 1. Setup environment

```bash
python -m venv venv
source venv/bin/activate   # on Windows: venv\Scripts\activate
pip install --upgrade pip
pip install wikiextractor sentence-transformers chromadb faiss-cpu nltk tqdm requests
```
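A quick sanity check that the key packages imported correctly (an optional one-liner, not part of the original steps):

```bash
python -c "import sentence_transformers, chromadb, nltk; print('imports ok')"
```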
---
## 2. Download Wikipedia dump

```bash
mkdir wiki-data && cd wiki-data
wget -c https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
```
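The full English dump is large (on the order of 20 GB compressed), so iterating on the pipeline with it is slow. A practical option is to test first against the much smaller Simple English Wikipedia dump, which uses the same format and works with the same commands (just swap the filename in the later steps):

```bash
wget -c https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles.xml.bz2
```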
---
## 3. Extract plain text

```bash
python -m wikiextractor.WikiExtractor -o extracted enwiki-latest-pages-articles.xml.bz2
```

This writes plain text into `extracted/`.
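wikiextractor typically writes its output as files named `wiki_00`, `wiki_01`, ... inside lettered subdirectories (`AA`, `AB`, ...), with no `.txt` extension; the chunking script below globs for `wiki_*` for that reason. You can confirm the layout on your machine with:

```bash
ls extracted/            # expect subdirectories such as AA, AB, ...
ls extracted/AA | head   # expect files named wiki_00, wiki_01, ...
```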
---
## 4. Chunk articles into passages

Save as `wiki_to_chunks.py`:
```python
import re, json
from pathlib import Path

import nltk
nltk.download('punkt')  # newer NLTK versions may also need: nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

EXTRACTED_DIR = "extracted"
OUTFILE = "wiki_chunks.jsonl"
MAX_CHARS = 1200   # rough passage size in characters

def iter_doc_texts(extracted_dir):
    """Yield (title, body) pairs from wikiextractor output files (wiki_00, wiki_01, ...)."""
    p = Path(extracted_dir)
    for file in p.rglob("wiki_*"):
        if not file.is_file():
            continue
        s = file.read_text(encoding="utf-8", errors="ignore")
        # Each article is wrapped in <doc ... title="..."> ... </doc>
        for m in re.finditer(r'(<doc[^>]*>)(.*?)</doc>', s, flags=re.S):
            open_tag = m.group(1)
            body = m.group(2).strip()
            tmatch = re.search(r'title="([^"]+)"', open_tag)
            title = tmatch.group(1) if tmatch else "NO_TITLE"
            yield title, body

def chunk_text(text, max_chars=MAX_CHARS):
    """Greedily pack whole sentences into chunks of roughly max_chars characters."""
    sents = sent_tokenize(text)
    chunks, cur, cur_len = [], [], 0
    for sent in sents:
        cur.append(sent)
        cur_len += len(sent)
        if cur_len >= max_chars:
            chunks.append(" ".join(cur))
            cur, cur_len = [], 0
    if cur:
        chunks.append(" ".join(cur))
    return chunks

def main():
    outfh = open(OUTFILE, "w", encoding="utf-8")
    idx = 0
    for title, body in tqdm(iter_doc_texts(EXTRACTED_DIR)):
        if len(body) < 200:   # skip stubs and near-empty pages
            continue
        chunks = chunk_text(body)
        for i, c in enumerate(chunks):
            doc_id = f"doc{idx:09d}-{i}"
            rec = {"id": doc_id, "title": title, "text": c}
            outfh.write(json.dumps(rec, ensure_ascii=False) + "\n")
        idx += 1
    outfh.close()
    print("Wrote", OUTFILE)

if __name__ == "__main__":
    main()
```
Run:

```bash
python wiki_to_chunks.py
```
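To spot-check the output before ingesting, you can print the first record (a small sketch; the field names match the script above):

```python
import json

with open("wiki_chunks.jsonl", encoding="utf-8") as fh:
    first = json.loads(next(fh))
print(first["title"])
print(first["text"][:200])
```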
---
## 5. Ingest into Chroma with embeddings

Save as `ingest_chroma.py`:
```python
import json

from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import chromadb

MODEL_NAME = "all-MiniLM-L6-v2"
CHUNKS_FILE = "wiki_chunks.jsonl"
BATCH = 512
PERSIST_DIR = "./chroma_db"

print("Loading embedding model...")
model = SentenceTransformer(MODEL_NAME)

print("Setting up Chroma...")
# chromadb >= 0.4: PersistentClient writes to disk automatically (no explicit persist() call).
# Older versions used chromadb.Client(Settings(chroma_db_impl="duckdb+parquet", ...)) instead.
client = chromadb.PersistentClient(path=PERSIST_DIR)
collection = client.get_or_create_collection("wikipedia")

# Load all chunks into memory (fine for a subset; stream in batches for the full dump).
docs, metas, ids = [], [], []
with open(CHUNKS_FILE, "r", encoding="utf-8") as fh:
    for line in fh:
        j = json.loads(line)
        ids.append(j["id"])
        docs.append(j["text"])
        metas.append({"title": j["title"]})

print("Total docs:", len(ids))

for i in tqdm(range(0, len(docs), BATCH)):
    batch_docs = docs[i:i+BATCH]
    batch_ids = ids[i:i+BATCH]
    batch_metas = metas[i:i+BATCH]
    emb = model.encode(batch_docs, convert_to_numpy=True).tolist()
    collection.add(ids=batch_ids, documents=batch_docs, metadatas=batch_metas, embeddings=emb)

print("Ingest complete.")
```
Run:

```bash
python ingest_chroma.py
```
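A quick way to verify the ingest actually persisted (a minimal check using the same store path and collection name as above):

```python
import chromadb

client = chromadb.PersistentClient(path="./chroma_db")
print(client.get_collection("wikipedia").count())
```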
---
## 6. Query with RAG + LM Studio

LM Studio exposes an OpenAI-compatible API server at http://localhost:1234/v1/chat/completions by default.
Start LM Studio, load your model (e.g. Mistral, Llama), and enable the local API server.
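Before wiring up the script, it is worth checking that the server is reachable (this assumes the default port 1234; the models endpoint is part of LM Studio's OpenAI-compatible API):

```bash
curl http://localhost:1234/v1/models
```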
Save as `query_rag.py`:
```python
import requests
from sentence_transformers import SentenceTransformer
import chromadb

LMSTUDIO_URL = "http://localhost:1234/v1/chat/completions"
EMBED_MODEL = "all-MiniLM-L6-v2"
CHROMA_DIR = "./chroma_db"
TOP_K = 4

emb_model = SentenceTransformer(EMBED_MODEL)
# Open the same persistent store the ingest script wrote to (chromadb >= 0.4 API).
client = chromadb.PersistentClient(path=CHROMA_DIR)
collection = client.get_collection("wikipedia")

def retrieve(query, top_k=TOP_K):
    """Embed the query and return the top_k (document, metadata) pairs from Chroma."""
    q_emb = emb_model.encode([query])[0].tolist()
    res = collection.query(query_embeddings=[q_emb], n_results=top_k)
    docs = res["documents"][0]
    metas = res["metadatas"][0]
    return list(zip(docs, metas))

def build_prompt(query, docs):
    """Assemble the retrieved passages and the question into a single grounded prompt."""
    context = "\n\n---\n\n".join(f"Title: {m.get('title','')}\n\n{d}" for d, m in docs)
    return f"""Use the context below (Wikipedia) to answer.
If the answer is not in the context, say you don't know.

CONTEXT:
{context}

QUESTION: {query}
"""

def generate_lmstudio(prompt):
    """Send the prompt to LM Studio's OpenAI-compatible chat completions endpoint."""
    headers = {"Content-Type": "application/json"}
    payload = {
        "model": "local-model",  # LM Studio ignores the name and uses the loaded model
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
        "max_tokens": 400,
    }
    r = requests.post(LMSTUDIO_URL, headers=headers, json=payload)
    r.raise_for_status()
    data = r.json()
    return data["choices"][0]["message"]["content"]

def answer(query):
    docs = retrieve(query)
    prompt = build_prompt(query, docs)
    return generate_lmstudio(prompt)

if __name__ == "__main__":
    q = input("Question: ")
    print(answer(q))
```
Run:

```bash
python query_rag.py
# Example: Who was Ada Lovelace?
```
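If you want to see which passages the answer was grounded in, the script's functions can also be used directly (a small sketch, assuming `query_rag.py` is importable from the working directory):

```python
from query_rag import retrieve, answer

question = "Who was Ada Lovelace?"
for doc, meta in retrieve(question):
    print("Retrieved:", meta["title"])
print(answer(question))
```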
---
## 7. Quick checklist

1) wget -c https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
2) python -m wikiextractor.WikiExtractor -o extracted enwiki-latest-pages-articles.xml.bz2
3) python wiki_to_chunks.py
4) python ingest_chroma.py
5) Start LM Studio (load model, enable API server)
6) python query_rag.py

---
That’s it 🚀
You now have a Wikipedia → RAG pipeline running locally with LM Studio.