Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- """
- voc.py
- SQLite -> Excel exporter for your vocabulary DB with Wiktionary definitions.
- Generates:
- - vocabulary_cleaned.xlsx (with 2 sheets: full data + Quizlet format)
- - voc.log (detailed run log)
- - definitions_cache.json (cached definitions)
- """
- import sqlite3
- import pandas as pd
- import requests
- import time
- import logging
- import json
- import os
- import re
- from typing import Optional
# ============= CONFIG =============
# Input SQLite database (presumably a KOReader vocabulary-builder export — see USER_AGENT).
DB_PATH = "vocabulary_builder.sqlite3"
# Output workbook and run artifacts (log + definition cache).
OUTPUT_EXCEL = "vocabulary_cleaned.xlsx"
LOG_FILE = "voc.log"
CACHE_FILE = "definitions_cache.json"
# MediaWiki API endpoint
MW_API = "https://en.wiktionary.org/w/api.php"
# Set a clear custom user agent per Wikimedia etiquette.
# Replace [email protected] with your real contact if you plan high-volume use.
USER_AGENT = "KoreaderVocabBot/1.0 (contact: [email protected])"
# Sleep time between API calls (seconds). Increase if you see rate limiting.
API_SLEEP = 0.3

# ============= logging setup =============
# Log to both voc.log (truncated each run via mode="w") and the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE, mode="w", encoding="utf-8"),
        logging.StreamHandler()
    ]
)
# Module-wide logger used by every function below.
log = logging.getLogger("voc")
- # ============= cache helpers =============
def load_cache():
    """Load the word->definition cache from CACHE_FILE; return {} if missing or unreadable."""
    if not os.path.exists(CACHE_FILE):
        return {}
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except Exception as exc:
        # A corrupt cache is not fatal — warn and start fresh.
        log.warning("Failed to read cache file: %s", exc)
        return {}
def save_cache(cache):
    """Persist *cache* to CACHE_FILE as pretty-printed UTF-8 JSON; log (not raise) on failure."""
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as handle:
            json.dump(cache, handle, ensure_ascii=False, indent=2)
        log.info("Saved cache (%d entries) to %s", len(cache), CACHE_FILE)
    except Exception as exc:
        # Best-effort persistence: a write failure should not abort the run.
        log.error("Failed to write cache file: %s", exc)
- # ============= Definition cleaning =============
- def clean_definition(text: str) -> str:
- """Remove all Wiki markup and formatting to get clean, readable text."""
- if not text:
- return ""
- # Remove {{lb|en|...}} language/context labels (like transitive, legal, etc.)
- text = re.sub(r'\{\{lb\|[^}]+\}\}', '', text)
- # Remove {{defdate|...}} date annotations
- text = re.sub(r'\{\{defdate\|[^}]+\}\}', '', text)
- # Remove {{R:...}} reference templates
- text = re.sub(r'\{\{R:[^}]+\}\}', '', text)
- # Remove {{ISO 639|...}} and similar templates
- text = re.sub(r'\{\{[Ii][Ss][Oo][^}]+\}\}', '', text)
- # Remove any remaining {{template}} tags
- text = re.sub(r'\{\{[^}]+\}\}', '', text)
- # Remove <ref>...</ref> tags and their content
- text = re.sub(r'<ref[^>]*>.*?</ref>', '', text, flags=re.DOTALL)
- # Remove <sup>, </sup> and other HTML tags
- text = re.sub(r'<[^>]+>', '', text)
- # Convert [[link#anchor|display]] to just display text
- text = re.sub(r'\[\[([^|\]]+#[^|\]]+\|)?([^\]]+)\]\]', r'\2', text)
- # Convert [[link|display]] to just display text
- text = re.sub(r'\[\[([^|\]]+\|)?([^\]]+)\]\]', r'\2', text)
- # Remove any remaining [[ or ]]
- text = text.replace('[[', '').replace(']]', '')
- # Remove bold/italic markup
- text = text.replace("'''", "").replace("''", "")
- # Clean up multiple spaces
- text = re.sub(r'\s+', ' ', text)
- # Remove leading/trailing whitespace and punctuation artifacts
- text = text.strip()
- text = text.strip(';.,')
- # Remove standalone periods or semicolons at the end
- text = re.sub(r'[;,]\s*$', '', text)
- return text
- # ============= Wiktionary lookup (robust) =============
def parse_first_definition_from_wikitext(wikitext: str) -> str:
    """Return the first cleaned top-level sense line ('#') from raw wikitext.

    Pragmatic best-effort parser: Wiktionary pages are loosely formatted,
    so we take the first line starting with a single '#' (sub-senses use
    '##') that still contains something meaningful after markup cleanup.
    Returns "" when no suitable line exists.
    """
    if not wikitext:
        return ""
    for raw_line in wikitext.splitlines():
        candidate = raw_line.lstrip()
        # Only top-level senses: a single leading '#', not '##' sub-senses.
        if not candidate.startswith("#") or candidate.startswith("##"):
            continue
        # Strip the list markers, then scrub remaining wiki markup.
        cleaned = clean_definition(candidate.lstrip("#: ").strip())
        # Ignore lines that clean down to nothing or tiny fragments.
        if cleaned and len(cleaned) > 3:
            return cleaned
    return ""
def get_wiktionary_definition(word: str, session: requests.Session, cache: dict) -> str:
    """Return first definition for word. Uses cache. Logs detailed HTTP info.

    Strategy:
      1. Serve from *cache* when the (stripped) word is already present.
      2. Fetch raw wikitext (prop=revisions) and parse the first '#' sense.
      3. If no sense line is found, fall back to plain-text extracts
         (prop=extracts) and scan those; last resort is the first paragraph.
    Retries up to 3 times with exponential backoff on HTTP 429/503. Every
    outcome (including "") is written to *cache* so the word is not
    re-fetched next run. Sleeps API_SLEEP after a successful fetch.
    """
    if not word:
        return ""
    word_key = word.strip()
    if word_key in cache:
        log.debug("cache hit: %s", word_key)
        return cache[word_key]
    # Request the raw page wikitext. NOTE(review): rvprop=content is the
    # legacy revisions format, which returns the text under the '*' key
    # read below — confirm the API has not dropped this format.
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "rvprop": "content",
        "titles": word_key,
        "redirects": 1
    }
    headers = {"User-Agent": USER_AGENT}
    # Try up to 3 times with backoff
    for attempt in range(1, 4):
        try:
            log.info("GET %s (attempt %d) for word=%s", MW_API, attempt, word_key)
            r = session.get(MW_API, params=params, headers=headers, timeout=10)
            log.info("HTTP %s for %s (len=%d)", r.status_code, word_key, len(r.content) if r.content else 0)
            # Log small response snippet for debugging (first 300 chars)
            snippet = r.text[:300].replace("\n", " ")
            log.debug("Response snippet: %s", snippet)
            if r.status_code == 200:
                data = r.json()
                pages = data.get("query", {}).get("pages", {})
                if not pages:
                    log.warning("No pages found for %s", word_key)
                    result = ""
                else:
                    # The query API keys pages by page-id; take the first (only) one.
                    page = next(iter(pages.values()))
                    # page may have 'revisions' with content under '*'
                    revs = page.get("revisions")
                    if revs and isinstance(revs, list) and len(revs) > 0:
                        content = revs[0].get("*", "")
                        # Parse first definition
                        definition = parse_first_definition_from_wikitext(content)
                        if definition:
                            result = definition
                        else:
                            # fallback: try extracts (plain text)
                            log.debug("No # line found in wikitext for %s; trying extracts fallback", word_key)
                            # second API call for extracts
                            params2 = {
                                "action": "query",
                                "format": "json",
                                "prop": "extracts",
                                "titles": word_key,
                                "explaintext": True,
                                "redirects": 1
                            }
                            r2 = session.get(MW_API, params=params2, headers=headers, timeout=10)
                            log.info("Fallback extracts HTTP %s for %s", r2.status_code, word_key)
                            text = r2.json().get("query", {}).get("pages", {})
                            page2 = next(iter(text.values()))
                            extract = page2.get("extract", "")
                            # try to find a definition line starting with '#'
                            result = ""
                            if extract:
                                for ll in extract.splitlines():
                                    if ll.strip().startswith("#"):
                                        cleaned = clean_definition(ll.strip().lstrip("#: ").strip())
                                        if cleaned and len(cleaned) > 3:
                                            result = cleaned
                                            break
                                # last resort: use first non-empty paragraph
                                if not result:
                                    paragraphs = [p.strip() for p in extract.split("\n\n") if p.strip()]
                                    result = clean_definition(paragraphs[0]) if paragraphs else ""
                    else:
                        log.warning("Page has no revisions for %s", word_key)
                        result = ""
                # store in cache and return (even "" — avoids re-fetching misses)
                cache[word_key] = result
                # be nice to the API
                time.sleep(API_SLEEP)
                return result
            elif r.status_code in (429, 503):
                # Rate-limited/overloaded: exponential backoff (2s, 4s, 8s) then retry.
                wait = 2 ** attempt
                log.warning("Rate-limited or server busy (HTTP %s). Backing off %ds", r.status_code, wait)
                time.sleep(wait)
                continue
            else:
                # Any other status (404, 500, ...) is treated as permanent — stop retrying.
                log.warning("Unexpected HTTP %s for %s", r.status_code, word_key)
                break
        except requests.RequestException as e:
            # Network-level failure: linear backoff and retry.
            log.exception("Request error for %s: %s", word_key, e)
            time.sleep(1 + attempt)
            continue
    # on failure return empty (and cache the miss)
    cache[word_key] = ""
    return ""
- # ============= Context formatting =============
- def add_bold_to_word_in_context(context: str, word: str) -> str:
- """Add **word** formatting around the word in the context."""
- if not context or not word:
- return context
- # Escape special regex characters in word
- word_escaped = re.escape(word)
- # Case-insensitive replacement with word boundaries
- # This handles the word appearing with different cases
- pattern = r'\b(' + word_escaped + r')\b'
- result = re.sub(pattern, r'**\1**', context, count=1, flags=re.IGNORECASE)
- return result
- # ============= Interactive title selection =============
def select_titles(df_titles):
    """Show the available books and interactively collect title IDs to export.

    Understands a single ID (5), comma lists (1,3,5), dash ranges (1-5),
    mixed forms (1,3,5-8,10), and the keyword 'all'. Re-prompts on invalid
    input and asks for confirmation before returning the chosen IDs.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("AVAILABLE BOOKS IN DATABASE")
    print(divider)
    # List every book with its numeric ID so the user can choose.
    for _, row in df_titles.iterrows():
        print(f"ID {row['title_id']:3d}: {row['book_title']}")
    print(divider)
    print("\nEnter the book IDs you want to export.")
    print("Examples:")
    print(" - Single book: 5")
    print(" - Multiple books: 1,3,5")
    print(" - Range: 1-5")
    print(" - Mixed: 1,3,5-8,10")
    print(" - All books: all")
    print()
    valid_ids = df_titles['title_id'].tolist()
    while True:
        user_input = input("Your selection: ").strip().lower()
        if user_input == "all":
            return valid_ids
        # Parse the comma-separated pieces; only int()/unpacking can raise here.
        try:
            selected_ids = []
            for piece in user_input.split(','):
                piece = piece.strip()
                if '-' in piece:
                    # Range like "1-5" is inclusive of both endpoints.
                    low, high = piece.split('-')
                    selected_ids.extend(range(int(low), int(high) + 1))
                else:
                    selected_ids.append(int(piece))
        except ValueError:
            print("❌ Invalid format. Please use numbers, commas, and ranges (e.g., 1,3,5-8)\n")
            continue
        # Reject IDs that do not exist in the title table.
        invalid_ids = [id for id in selected_ids if id not in valid_ids]
        if invalid_ids:
            print(f"❌ Invalid IDs: {invalid_ids}. Please try again.\n")
            continue
        if not selected_ids:
            print("❌ No IDs selected. Please try again.\n")
            continue
        # Echo the selection back and ask for confirmation.
        chosen = df_titles[df_titles['title_id'].isin(selected_ids)]
        print(f"\n✓ You selected {len(selected_ids)} book(s):")
        for _, row in chosen.iterrows():
            print(f" - ID {row['title_id']}: {row['book_title']}")
        confirm = input("\nProceed with these books? (y/n): ").strip().lower()
        if confirm == 'y':
            return selected_ids
        print("\nLet's try again.\n")
- # ============= Main pipeline =============
def main():
    """Run the export pipeline: SQLite DB -> Wiktionary -> Excel.

    Steps: load the definition cache, read the vocabulary/title tables,
    let the user pick books, fetch definitions (with caching and periodic
    saves), bold each word inside its context, and write a two-sheet
    workbook ('Full Data' + formula-linked 'Quizlet Import').
    """
    log.info("Starting voc.py")
    # Load previously fetched definitions so re-runs are cheap.
    cache = load_cache()
    # Bail out early if the database is missing.
    if not os.path.exists(DB_PATH):
        log.error("Database not found: %s", DB_PATH)
        return
    conn = sqlite3.connect(DB_PATH)
    try:
        df_vocab = pd.read_sql_query("SELECT * FROM vocabulary;", conn)
        df_titles = pd.read_sql_query("SELECT * FROM title;", conn)
    finally:
        conn.close()
    # Normalize title table columns to match the vocabulary table's FK name.
    df_titles = df_titles.rename(columns={"id": "title_id", "name": "book_title"})
    # Interactive title selection
    selected_title_ids = select_titles(df_titles)
    log.info("User selected %d book(s): %s", len(selected_title_ids), selected_title_ids)
    # Filter vocabulary by selected titles
    df_vocab_filtered = df_vocab[df_vocab['title_id'].isin(selected_title_ids)]
    log.info("Filtered to %d vocabulary rows from selected books", len(df_vocab_filtered))
    # Attach book titles to each vocabulary row.
    df = df_vocab_filtered.merge(df_titles[["title_id", "book_title"]], on="title_id", how="left")
    log.info("Loaded %d vocabulary rows", len(df))
    # Combine prev/word/next into one context sentence.
    df["context_full"] = (
        df["prev_context"].fillna("") + " " +
        df["word"].fillna("") + " " +
        df["next_context"].fillna("")
    ).str.strip()
    # One HTTP session for all lookups (connection reuse).
    session = requests.Session()
    # Fetch definitions (with cache)
    words = df["word"].astype(str).tolist()
    log.info("Fetching definitions for %d words", len(words))
    defs = []
    for i, w in enumerate(words, start=1):
        defs.append(get_wiktionary_definition(w, session, cache))
        if i % 50 == 0:
            log.info("Progress: %d/%d", i, len(words))
            save_cache(cache)  # periodic save so progress survives a crash
    df["definition"] = defs
    # Add bold formatting to word in context
    log.info("Adding bold formatting to words in context")
    df["context_bold"] = df.apply(
        lambda row: add_bold_to_word_in_context(row["context_full"], row["word"]),
        axis=1
    )
    # Sheet 1: Full data with bold word in context
    df_full = df[["book_title", "word", "context_bold", "definition"]]
    df_full = df_full.rename(columns={"context_bold": "context"})
    # Export to Excel with multiple sheets
    with pd.ExcelWriter(OUTPUT_EXCEL, engine='openpyxl') as writer:
        df_full.to_excel(writer, sheet_name='Full Data', index=False)
        # Sheet 2: Quizlet format with formulas linking to Full Data
        ws_quizlet = writer.book.create_sheet('Quizlet Import')
        # Write headers
        ws_quizlet['A1'] = 'book_title'
        ws_quizlet['B1'] = 'word'
        ws_quizlet['C1'] = 'definition'
        # Write formulas for each row (starting at row 2).
        # 'Full Data' columns: A=book_title, B=word, C=context, D=definition,
        # so the Quizlet sheet pulls A, B and D.
        for i in range(2, len(df_full) + 2):
            ws_quizlet[f'A{i}'] = f"='Full Data'!A{i}"
            ws_quizlet[f'B{i}'] = f"='Full Data'!B{i}"
            ws_quizlet[f'C{i}'] = f"='Full Data'!D{i}"
    log.info("Wrote output to %s with 2 sheets", OUTPUT_EXCEL)
    log.info(" - 'Full Data': Complete vocabulary list with bold words")
    # BUGFIX: this message previously claimed the columns were
    # (book_title, context, definition); the sheet actually carries the word.
    log.info(" - 'Quizlet Import': 3-column format (book_title, word, definition)")
    # Final cache save covers the tail of the run (< 50 words since last periodic save).
    save_cache(cache)
    log.info("Done.")


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment