Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- """
- voc.py
- SQLite -> Excel exporter for your vocabulary DB with Wiktionary definitions.
- Generates:
- - vocabulary_cleaned.xlsx (with 2 sheets: full data + Quizlet format)
- - voc.log (detailed run log)
- - definitions_cache.json (cached definitions)
- """
- import sqlite3
- import pandas as pd
- import requests
- import time
- import logging
- import json
- import os
- import re
- from typing import Optional
# ============= CONFIG =============
# Input SQLite database (presumably a KOReader vocabulary-builder export — see USER_AGENT).
DB_PATH = "vocabulary_builder.sqlite3"
# Output workbook and run artifacts (log + definition cache).
OUTPUT_EXCEL = "vocabulary_cleaned.xlsx"
LOG_FILE = "voc.log"
CACHE_FILE = "definitions_cache.json"
# MediaWiki API endpoint
MW_API = "https://en.wiktionary.org/w/api.php"
# Set a clear custom user agent per Wikimedia etiquette.
# Replace [email protected] with your real contact if you plan high-volume use.
USER_AGENT = "KoreaderVocabBot/1.0 (contact: [email protected])"
# Sleep time between API calls (seconds). Increase if you see rate limiting.
API_SLEEP = 0.3

# ============= logging setup =============
# Log to both voc.log (truncated each run via mode="w") and the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE, mode="w", encoding="utf-8"),
        logging.StreamHandler()
    ]
)
# Module-wide logger used by every function below.
log = logging.getLogger("voc")
- # ============= cache helpers =============
def load_cache():
    """Load the word->definition cache from CACHE_FILE; return {} if missing or unreadable."""
    if not os.path.exists(CACHE_FILE):
        return {}
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except Exception as exc:
        # A corrupt cache is not fatal — warn and start fresh.
        log.warning("Failed to read cache file: %s", exc)
        return {}
def save_cache(cache):
    """Persist *cache* to CACHE_FILE as pretty-printed UTF-8 JSON; log (not raise) on failure."""
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as handle:
            json.dump(cache, handle, ensure_ascii=False, indent=2)
        log.info("Saved cache (%d entries) to %s", len(cache), CACHE_FILE)
    except Exception as exc:
        # Best-effort persistence: a write failure should not abort the run.
        log.error("Failed to write cache file: %s", exc)
- # ============= Definition cleaning =============
- def clean_definition(text: str) -> str:
- """Remove all Wiki markup and formatting to get clean, readable text."""
- if not text:
- return ""
- # Remove {{lb|en|...}} language/context labels (like transitive, legal, etc.)
- text = re.sub(r'\{\{lb\|[^}]+\}\}', '', text)
- # Remove {{defdate|...}} date annotations
- text = re.sub(r'\{\{defdate\|[^}]+\}\}', '', text)
- # Remove {{R:...}} reference templates
- text = re.sub(r'\{\{R:[^}]+\}\}', '', text)
- # Remove {{ISO 639|...}} and similar templates
- text = re.sub(r'\{\{[Ii][Ss][Oo][^}]+\}\}', '', text)
- # Remove any remaining {{template}} tags
- text = re.sub(r'\{\{[^}]+\}\}', '', text)
- # Remove <ref>...</ref> tags and their content
- text = re.sub(r'<ref[^>]*>.*?</ref>', '', text, flags=re.DOTALL)
- # Remove <sup>, </sup> and other HTML tags
- text = re.sub(r'<[^>]+>', '', text)
- # Convert [[link#anchor|display]] to just display text
- text = re.sub(r'\[\[([^|\]]+#[^|\]]+\|)?([^\]]+)\]\]', r'\2', text)
- # Convert [[link|display]] to just display text
- text = re.sub(r'\[\[([^|\]]+\|)?([^\]]+)\]\]', r'\2', text)
- # Remove any remaining [[ or ]]
- text = text.replace('[[', '').replace(']]', '')
- # Remove bold/italic markup
- text = text.replace("'''", "").replace("''", "")
- # Clean up multiple spaces
- text = re.sub(r'\s+', ' ', text)
- # Remove leading/trailing whitespace and punctuation artifacts
- text = text.strip()
- text = text.strip(';.,')
- # Remove standalone periods or semicolons at the end
- text = re.sub(r'[;,]\s*$', '', text)
- return text
- # ============= Wiktionary lookup (robust) =============
def parse_first_definition_from_wikitext(wikitext: str) -> str:
    """Return the first cleaned top-level sense line ('#') from raw wikitext.

    Pragmatic best-effort parser: Wiktionary pages are loosely formatted,
    so we take the first line starting with a single '#' (sub-senses use
    '##') that still contains something meaningful after markup cleanup.
    Returns "" when no suitable line exists.
    """
    if not wikitext:
        return ""
    for raw_line in wikitext.splitlines():
        candidate = raw_line.lstrip()
        # Only top-level senses: a single leading '#', not '##' sub-senses.
        if not candidate.startswith("#") or candidate.startswith("##"):
            continue
        # Strip the list markers, then scrub remaining wiki markup.
        cleaned = clean_definition(candidate.lstrip("#: ").strip())
        # Ignore lines that clean down to nothing or tiny fragments.
        if cleaned and len(cleaned) > 3:
            return cleaned
    return ""
def get_wiktionary_definition(word: str, session: requests.Session, cache: dict) -> str:
    """Return first definition for word. Uses cache. Logs detailed HTTP info.

    Strategy:
      1. Serve from *cache* when the (stripped) word is already present.
      2. Fetch raw wikitext (prop=revisions) and parse the first '#' sense.
      3. If no sense line is found, fall back to plain-text extracts
         (prop=extracts) and scan those; last resort is the first paragraph.
    Retries up to 3 times with exponential backoff on HTTP 429/503. Every
    outcome (including "") is written to *cache* so the word is not
    re-fetched next run. Sleeps API_SLEEP after a successful fetch.
    """
    if not word:
        return ""
    word_key = word.strip()
    if word_key in cache:
        log.debug("cache hit: %s", word_key)
        return cache[word_key]
    # Request the raw page wikitext. NOTE(review): rvprop=content is the
    # legacy revisions format, which returns the text under the '*' key
    # read below — confirm the API has not dropped this format.
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "rvprop": "content",
        "titles": word_key,
        "redirects": 1
    }
    headers = {"User-Agent": USER_AGENT}
    # Try up to 3 times with backoff
    for attempt in range(1, 4):
        try:
            log.info("GET %s (attempt %d) for word=%s", MW_API, attempt, word_key)
            r = session.get(MW_API, params=params, headers=headers, timeout=10)
            log.info("HTTP %s for %s (len=%d)", r.status_code, word_key, len(r.content) if r.content else 0)
            # Log small response snippet for debugging (first 300 chars)
            snippet = r.text[:300].replace("\n", " ")
            log.debug("Response snippet: %s", snippet)
            if r.status_code == 200:
                data = r.json()
                pages = data.get("query", {}).get("pages", {})
                if not pages:
                    log.warning("No pages found for %s", word_key)
                    result = ""
                else:
                    # The query API keys pages by page-id; take the first (only) one.
                    page = next(iter(pages.values()))
                    # page may have 'revisions' with content under '*'
                    revs = page.get("revisions")
                    if revs and isinstance(revs, list) and len(revs) > 0:
                        content = revs[0].get("*", "")
                        # Parse first definition
                        definition = parse_first_definition_from_wikitext(content)
                        if definition:
                            result = definition
                        else:
                            # fallback: try extracts (plain text)
                            log.debug("No # line found in wikitext for %s; trying extracts fallback", word_key)
                            # second API call for extracts
                            params2 = {
                                "action": "query",
                                "format": "json",
                                "prop": "extracts",
                                "titles": word_key,
                                "explaintext": True,
                                "redirects": 1
                            }
                            r2 = session.get(MW_API, params=params2, headers=headers, timeout=10)
                            log.info("Fallback extracts HTTP %s for %s", r2.status_code, word_key)
                            text = r2.json().get("query", {}).get("pages", {})
                            page2 = next(iter(text.values()))
                            extract = page2.get("extract", "")
                            # try to find a definition line starting with '#'
                            result = ""
                            if extract:
                                for ll in extract.splitlines():
                                    if ll.strip().startswith("#"):
                                        cleaned = clean_definition(ll.strip().lstrip("#: ").strip())
                                        if cleaned and len(cleaned) > 3:
                                            result = cleaned
                                            break
                                # last resort: use first non-empty paragraph
                                if not result:
                                    paragraphs = [p.strip() for p in extract.split("\n\n") if p.strip()]
                                    result = clean_definition(paragraphs[0]) if paragraphs else ""
                    else:
                        log.warning("Page has no revisions for %s", word_key)
                        result = ""
                # store in cache and return (even "" — avoids re-fetching misses)
                cache[word_key] = result
                # be nice to the API
                time.sleep(API_SLEEP)
                return result
            elif r.status_code in (429, 503):
                # Rate-limited/overloaded: exponential backoff (2s, 4s, 8s) then retry.
                wait = 2 ** attempt
                log.warning("Rate-limited or server busy (HTTP %s). Backing off %ds", r.status_code, wait)
                time.sleep(wait)
                continue
            else:
                # Any other status (404, 500, ...) is treated as permanent — stop retrying.
                log.warning("Unexpected HTTP %s for %s", r.status_code, word_key)
                break
        except requests.RequestException as e:
            # Network-level failure: linear backoff and retry.
            log.exception("Request error for %s: %s", word_key, e)
            time.sleep(1 + attempt)
            continue
    # on failure return empty (and cache the miss)
    cache[word_key] = ""
    return ""
- # ============= Context formatting =============
- def add_bold_to_word_in_context(context: str, word: str) -> str:
- """Add **word** formatting around the word in the context."""
- if not context or not word:
- return context
- # Escape special regex characters in word
- word_escaped = re.escape(word)
- # Case-insensitive replacement with word boundaries
- # This handles the word appearing with different cases
- pattern = r'\b(' + word_escaped + r')\b'
- result = re.sub(pattern, r'**\1**', context, count=1, flags=re.IGNORECASE)
- return result
- # ============= Interactive title selection =============
def select_titles(df_titles):
    """Show the available books and interactively collect title IDs to export.

    Understands a single ID (5), comma lists (1,3,5), dash ranges (1-5),
    mixed forms (1,3,5-8,10), and the keyword 'all'. Re-prompts on invalid
    input and asks for confirmation before returning the chosen IDs.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("AVAILABLE BOOKS IN DATABASE")
    print(divider)
    # List every book with its numeric ID so the user can choose.
    for _, row in df_titles.iterrows():
        print(f"ID {row['title_id']:3d}: {row['book_title']}")
    print(divider)
    print("\nEnter the book IDs you want to export.")
    print("Examples:")
    print(" - Single book: 5")
    print(" - Multiple books: 1,3,5")
    print(" - Range: 1-5")
    print(" - Mixed: 1,3,5-8,10")
    print(" - All books: all")
    print()
    valid_ids = df_titles['title_id'].tolist()
    while True:
        user_input = input("Your selection: ").strip().lower()
        if user_input == "all":
            return valid_ids
        # Parse the comma-separated pieces; only int()/unpacking can raise here.
        try:
            selected_ids = []
            for piece in user_input.split(','):
                piece = piece.strip()
                if '-' in piece:
                    # Range like "1-5" is inclusive of both endpoints.
                    low, high = piece.split('-')
                    selected_ids.extend(range(int(low), int(high) + 1))
                else:
                    selected_ids.append(int(piece))
        except ValueError:
            print("❌ Invalid format. Please use numbers, commas, and ranges (e.g., 1,3,5-8)\n")
            continue
        # Reject IDs that do not exist in the title table.
        invalid_ids = [id for id in selected_ids if id not in valid_ids]
        if invalid_ids:
            print(f"❌ Invalid IDs: {invalid_ids}. Please try again.\n")
            continue
        if not selected_ids:
            print("❌ No IDs selected. Please try again.\n")
            continue
        # Echo the selection back and ask for confirmation.
        chosen = df_titles[df_titles['title_id'].isin(selected_ids)]
        print(f"\n✓ You selected {len(selected_ids)} book(s):")
        for _, row in chosen.iterrows():
            print(f" - ID {row['title_id']}: {row['book_title']}")
        confirm = input("\nProceed with these books? (y/n): ").strip().lower()
        if confirm == 'y':
            return selected_ids
        print("\nLet's try again.\n")
- # ============= Main pipeline =============
def main():
    """Run the export pipeline: SQLite DB -> Wiktionary -> Excel.

    Steps: load the definition cache, read the vocabulary/title tables,
    let the user pick books, fetch definitions (with caching and periodic
    saves), bold each word inside its context, and write a two-sheet
    workbook ('Full Data' + formula-linked 'Quizlet Import').
    """
    log.info("Starting voc.py")
    # Load previously fetched definitions so re-runs are cheap.
    cache = load_cache()
    # Bail out early if the database is missing.
    if not os.path.exists(DB_PATH):
        log.error("Database not found: %s", DB_PATH)
        return
    conn = sqlite3.connect(DB_PATH)
    try:
        df_vocab = pd.read_sql_query("SELECT * FROM vocabulary;", conn)
        df_titles = pd.read_sql_query("SELECT * FROM title;", conn)
    finally:
        conn.close()
    # Normalize title table columns to match the vocabulary table's FK name.
    df_titles = df_titles.rename(columns={"id": "title_id", "name": "book_title"})
    # Interactive title selection
    selected_title_ids = select_titles(df_titles)
    log.info("User selected %d book(s): %s", len(selected_title_ids), selected_title_ids)
    # Filter vocabulary by selected titles
    df_vocab_filtered = df_vocab[df_vocab['title_id'].isin(selected_title_ids)]
    log.info("Filtered to %d vocabulary rows from selected books", len(df_vocab_filtered))
    # Attach book titles to each vocabulary row.
    df = df_vocab_filtered.merge(df_titles[["title_id", "book_title"]], on="title_id", how="left")
    log.info("Loaded %d vocabulary rows", len(df))
    # Combine prev/word/next into one context sentence.
    df["context_full"] = (
        df["prev_context"].fillna("") + " " +
        df["word"].fillna("") + " " +
        df["next_context"].fillna("")
    ).str.strip()
    # One HTTP session for all lookups (connection reuse).
    session = requests.Session()
    # Fetch definitions (with cache)
    words = df["word"].astype(str).tolist()
    log.info("Fetching definitions for %d words", len(words))
    defs = []
    for i, w in enumerate(words, start=1):
        defs.append(get_wiktionary_definition(w, session, cache))
        if i % 50 == 0:
            log.info("Progress: %d/%d", i, len(words))
            save_cache(cache)  # periodic save so progress survives a crash
    df["definition"] = defs
    # Add bold formatting to word in context
    log.info("Adding bold formatting to words in context")
    df["context_bold"] = df.apply(
        lambda row: add_bold_to_word_in_context(row["context_full"], row["word"]),
        axis=1
    )
    # Sheet 1: Full data with bold word in context
    df_full = df[["book_title", "word", "context_bold", "definition"]]
    df_full = df_full.rename(columns={"context_bold": "context"})
    # Export to Excel with multiple sheets
    with pd.ExcelWriter(OUTPUT_EXCEL, engine='openpyxl') as writer:
        df_full.to_excel(writer, sheet_name='Full Data', index=False)
        # Sheet 2: Quizlet format with formulas linking to Full Data
        ws_quizlet = writer.book.create_sheet('Quizlet Import')
        # Write headers
        ws_quizlet['A1'] = 'book_title'
        ws_quizlet['B1'] = 'word'
        ws_quizlet['C1'] = 'definition'
        # Write formulas for each row (starting at row 2).
        # 'Full Data' columns: A=book_title, B=word, C=context, D=definition,
        # so the Quizlet sheet pulls A, B and D.
        for i in range(2, len(df_full) + 2):
            ws_quizlet[f'A{i}'] = f"='Full Data'!A{i}"
            ws_quizlet[f'B{i}'] = f"='Full Data'!B{i}"
            ws_quizlet[f'C{i}'] = f"='Full Data'!D{i}"
    log.info("Wrote output to %s with 2 sheets", OUTPUT_EXCEL)
    log.info(" - 'Full Data': Complete vocabulary list with bold words")
    # BUGFIX: this message previously claimed the columns were
    # (book_title, context, definition); the sheet actually carries the word.
    log.info(" - 'Quizlet Import': 3-column format (book_title, word, definition)")
    # Final cache save covers the tail of the run (< 50 words since last periodic save).
    save_cache(cache)
    log.info("Done.")


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment