Hedwige66

KOReader vocabulary builder exporter

Dec 9th, 2025
110
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 15.96 KB | Source Code | 0 0
  1. #!/usr/bin/env python3
  2. """
  3. voc.py
  4. SQLite -> Excel exporter for your vocabulary DB with Wiktionary definitions.
  5. Generates:
  6.  - vocabulary_cleaned.xlsx (with 2 sheets: full data + Quizlet format)
  7.  - voc.log (detailed run log)
  8.  - definitions_cache.json (cached definitions)
  9. """
  10.  
  11. import sqlite3
  12. import pandas as pd
  13. import requests
  14. import time
  15. import logging
  16. import json
  17. import os
  18. import re
  19. from typing import Optional
  20.  
# ============= CONFIG =============
DB_PATH = "vocabulary_builder.sqlite3"   # KOReader vocabulary-builder SQLite database (input)
OUTPUT_EXCEL = "vocabulary_cleaned.xlsx" # generated workbook: 'Full Data' + 'Quizlet Import' sheets
LOG_FILE = "voc.log"                     # detailed run log (overwritten each run, see mode="w" below)
CACHE_FILE = "definitions_cache.json"    # word -> definition cache, reused across runs

# MediaWiki API endpoint (English Wiktionary)
MW_API = "https://en.wiktionary.org/w/api.php"

# Set a clear custom user agent per Wikimedia etiquette.
# Replace [email protected] with your real contact if you plan high-volume use.
USER_AGENT = "KoreaderVocabBot/1.0 (contact: [email protected])"

# Sleep time between API calls (seconds). Increase if you see rate limiting.
API_SLEEP = 0.3

# ============= logging setup =============
# Logs go both to LOG_FILE (truncated on every run) and to the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE, mode="w", encoding="utf-8"),
        logging.StreamHandler()
    ]
)
log = logging.getLogger("voc")
  47.  
  48. # ============= cache helpers =============
  49. def load_cache():
  50.     if os.path.exists(CACHE_FILE):
  51.         try:
  52.             with open(CACHE_FILE, "r", encoding="utf-8") as f:
  53.                 return json.load(f)
  54.         except Exception as e:
  55.             log.warning("Failed to read cache file: %s", e)
  56.     return {}
  57.  
  58. def save_cache(cache):
  59.     try:
  60.         with open(CACHE_FILE, "w", encoding="utf-8") as f:
  61.             json.dump(cache, f, ensure_ascii=False, indent=2)
  62.             log.info("Saved cache (%d entries) to %s", len(cache), CACHE_FILE)
  63.     except Exception as e:
  64.         log.error("Failed to write cache file: %s", e)
  65.  
  66. # ============= Definition cleaning =============
  67. def clean_definition(text: str) -> str:
  68.     """Remove all Wiki markup and formatting to get clean, readable text."""
  69.     if not text:
  70.         return ""
  71.    
  72.     # Remove {{lb|en|...}} language/context labels (like transitive, legal, etc.)
  73.     text = re.sub(r'\{\{lb\|[^}]+\}\}', '', text)
  74.    
  75.     # Remove {{defdate|...}} date annotations
  76.     text = re.sub(r'\{\{defdate\|[^}]+\}\}', '', text)
  77.    
  78.     # Remove {{R:...}} reference templates
  79.     text = re.sub(r'\{\{R:[^}]+\}\}', '', text)
  80.    
  81.     # Remove {{ISO 639|...}} and similar templates
  82.     text = re.sub(r'\{\{[Ii][Ss][Oo][^}]+\}\}', '', text)
  83.    
  84.     # Remove any remaining {{template}} tags
  85.     text = re.sub(r'\{\{[^}]+\}\}', '', text)
  86.    
  87.     # Remove <ref>...</ref> tags and their content
  88.     text = re.sub(r'<ref[^>]*>.*?</ref>', '', text, flags=re.DOTALL)
  89.    
  90.     # Remove <sup>, </sup> and other HTML tags
  91.     text = re.sub(r'<[^>]+>', '', text)
  92.    
  93.     # Convert [[link#anchor|display]] to just display text
  94.     text = re.sub(r'\[\[([^|\]]+#[^|\]]+\|)?([^\]]+)\]\]', r'\2', text)
  95.    
  96.     # Convert [[link|display]] to just display text
  97.     text = re.sub(r'\[\[([^|\]]+\|)?([^\]]+)\]\]', r'\2', text)
  98.    
  99.     # Remove any remaining [[ or ]]
  100.     text = text.replace('[[', '').replace(']]', '')
  101.    
  102.     # Remove bold/italic markup
  103.     text = text.replace("'''", "").replace("''", "")
  104.    
  105.     # Clean up multiple spaces
  106.     text = re.sub(r'\s+', ' ', text)
  107.    
  108.     # Remove leading/trailing whitespace and punctuation artifacts
  109.     text = text.strip()
  110.     text = text.strip(';.,')
  111.    
  112.     # Remove standalone periods or semicolons at the end
  113.     text = re.sub(r'[;,]\s*$', '', text)
  114.    
  115.     return text
  116.  
  117. # ============= Wiktionary lookup (robust) =============
  118. def parse_first_definition_from_wikitext(wikitext: str) -> str:
  119.     """Extract the first sense line starting with '#' from raw wikitext.
  120.       This is a pragmatic parser; Wiktionary pages are not strictly formatted,
  121.       so this attempts to pick a sensible first definition.
  122.    """
  123.     if not wikitext:
  124.         return ""
  125.     lines = wikitext.splitlines()
  126.     # Seek lines beginning with exactly '#' possibly preceded by ':*' or multiple '#' chars
  127.     for line in lines:
  128.         line_stripped = line.lstrip()
  129.         if line_stripped.startswith("#") and not line_stripped.startswith("##"):
  130.             # remove leading '#' and any ":" or whitespace
  131.             d = line_stripped.lstrip("#: ").strip()
  132.             # Clean the definition of all wiki markup
  133.             d = clean_definition(d)
  134.             # if definition is empty after cleanup, continue searching
  135.             if d and len(d) > 3:  # Avoid very short fragments
  136.                 return d
  137.     return ""
  138.  
  139. def get_wiktionary_definition(word: str, session: requests.Session, cache: dict) -> str:
  140.     """Return first definition for word. Uses cache. Logs detailed HTTP info."""
  141.     if not word:
  142.         return ""
  143.     word_key = word.strip()
  144.     if word_key in cache:
  145.         log.debug("cache hit: %s", word_key)
  146.         return cache[word_key]
  147.  
  148.     params = {
  149.         "action": "query",
  150.         "format": "json",
  151.         "prop": "revisions",
  152.         "rvprop": "content",
  153.         "titles": word_key,
  154.         "redirects": 1
  155.     }
  156.  
  157.     headers = {"User-Agent": USER_AGENT}
  158.  
  159.     # Try up to 3 times with backoff
  160.     for attempt in range(1, 4):
  161.         try:
  162.             log.info("GET %s (attempt %d) for word=%s", MW_API, attempt, word_key)
  163.             r = session.get(MW_API, params=params, headers=headers, timeout=10)
  164.             log.info("HTTP %s for %s (len=%d)", r.status_code, word_key, len(r.content) if r.content else 0)
  165.             # Log small response snippet for debugging (first 300 chars)
  166.             snippet = r.text[:300].replace("\n", " ")
  167.             log.debug("Response snippet: %s", snippet)
  168.  
  169.             if r.status_code == 200:
  170.                 data = r.json()
  171.                 pages = data.get("query", {}).get("pages", {})
  172.                 if not pages:
  173.                     log.warning("No pages found for %s", word_key)
  174.                     result = ""
  175.                 else:
  176.                     page = next(iter(pages.values()))
  177.                     # page may have 'revisions' with content under '*'
  178.                     revs = page.get("revisions")
  179.                     if revs and isinstance(revs, list) and len(revs) > 0:
  180.                         content = revs[0].get("*", "")
  181.                         # Parse first definition
  182.                         definition = parse_first_definition_from_wikitext(content)
  183.                         if definition:
  184.                             result = definition
  185.                         else:
  186.                             # fallback: try extracts (plain text)
  187.                             log.debug("No # line found in wikitext for %s; trying extracts fallback", word_key)
  188.                             # second API call for extracts
  189.                             params2 = {
  190.                                 "action": "query",
  191.                                 "format": "json",
  192.                                 "prop": "extracts",
  193.                                 "titles": word_key,
  194.                                 "explaintext": True,
  195.                                 "redirects": 1
  196.                             }
  197.                             r2 = session.get(MW_API, params=params2, headers=headers, timeout=10)
  198.                             log.info("Fallback extracts HTTP %s for %s", r2.status_code, word_key)
  199.                             text = r2.json().get("query", {}).get("pages", {})
  200.                             page2 = next(iter(text.values()))
  201.                             extract = page2.get("extract", "")
  202.                             # try to find a definition line starting with '#'
  203.                             result = ""
  204.                             if extract:
  205.                                 for ll in extract.splitlines():
  206.                                     if ll.strip().startswith("#"):
  207.                                         cleaned = clean_definition(ll.strip().lstrip("#: ").strip())
  208.                                         if cleaned and len(cleaned) > 3:
  209.                                             result = cleaned
  210.                                             break
  211.                             # last resort: use first non-empty paragraph
  212.                             if not result:
  213.                                 paragraphs = [p.strip() for p in extract.split("\n\n") if p.strip()]
  214.                                 result = clean_definition(paragraphs[0]) if paragraphs else ""
  215.                     else:
  216.                         log.warning("Page has no revisions for %s", word_key)
  217.                         result = ""
  218.                 # store in cache and return
  219.                 cache[word_key] = result
  220.                 # be nice to the API
  221.                 time.sleep(API_SLEEP)
  222.                 return result
  223.             elif r.status_code in (429, 503):
  224.                 wait = 2 ** attempt
  225.                 log.warning("Rate-limited or server busy (HTTP %s). Backing off %ds", r.status_code, wait)
  226.                 time.sleep(wait)
  227.                 continue
  228.             else:
  229.                 log.warning("Unexpected HTTP %s for %s", r.status_code, word_key)
  230.                 break
  231.         except requests.RequestException as e:
  232.             log.exception("Request error for %s: %s", word_key, e)
  233.             time.sleep(1 + attempt)
  234.             continue
  235.  
  236.     # on failure return empty
  237.     cache[word_key] = ""
  238.     return ""
  239.  
  240. # ============= Context formatting =============
  241. def add_bold_to_word_in_context(context: str, word: str) -> str:
  242.     """Add **word** formatting around the word in the context."""
  243.     if not context or not word:
  244.         return context
  245.    
  246.     # Escape special regex characters in word
  247.     word_escaped = re.escape(word)
  248.    
  249.     # Case-insensitive replacement with word boundaries
  250.     # This handles the word appearing with different cases
  251.     pattern = r'\b(' + word_escaped + r')\b'
  252.     result = re.sub(pattern, r'**\1**', context, count=1, flags=re.IGNORECASE)
  253.    
  254.     return result
  255.  
  256. # ============= Interactive title selection =============
  257. def select_titles(df_titles):
  258.     """Display available titles and let user select which ones to include."""
  259.     print("\n" + "="*60)
  260.     print("AVAILABLE BOOKS IN DATABASE")
  261.     print("="*60)
  262.    
  263.     # Display all titles with their IDs
  264.     for _, row in df_titles.iterrows():
  265.         title_id = row['title_id']
  266.         book_title = row['book_title']
  267.         print(f"ID {title_id:3d}: {book_title}")
  268.    
  269.     print("="*60)
  270.     print("\nEnter the book IDs you want to export.")
  271.     print("Examples:")
  272.     print("  - Single book: 5")
  273.     print("  - Multiple books: 1,3,5")
  274.     print("  - Range: 1-5")
  275.     print("  - Mixed: 1,3,5-8,10")
  276.     print("  - All books: all")
  277.     print()
  278.    
  279.     while True:
  280.         user_input = input("Your selection: ").strip().lower()
  281.        
  282.         if user_input == "all":
  283.             return df_titles['title_id'].tolist()
  284.        
  285.         try:
  286.             selected_ids = []
  287.             parts = user_input.split(',')
  288.            
  289.             for part in parts:
  290.                 part = part.strip()
  291.                 if '-' in part:
  292.                     # Handle range (e.g., "1-5")
  293.                     start, end = part.split('-')
  294.                     selected_ids.extend(range(int(start), int(end) + 1))
  295.                 else:
  296.                     # Handle single ID
  297.                     selected_ids.append(int(part))
  298.            
  299.             # Validate that all IDs exist
  300.             valid_ids = df_titles['title_id'].tolist()
  301.             invalid_ids = [id for id in selected_ids if id not in valid_ids]
  302.            
  303.             if invalid_ids:
  304.                 print(f"❌ Invalid IDs: {invalid_ids}. Please try again.\n")
  305.                 continue
  306.            
  307.             if not selected_ids:
  308.                 print("❌ No IDs selected. Please try again.\n")
  309.                 continue
  310.            
  311.             # Show confirmation
  312.             selected_titles = df_titles[df_titles['title_id'].isin(selected_ids)]
  313.             print(f"\n✓ You selected {len(selected_ids)} book(s):")
  314.             for _, row in selected_titles.iterrows():
  315.                 print(f"  - ID {row['title_id']}: {row['book_title']}")
  316.            
  317.             confirm = input("\nProceed with these books? (y/n): ").strip().lower()
  318.             if confirm == 'y':
  319.                 return selected_ids
  320.             else:
  321.                 print("\nLet's try again.\n")
  322.                
  323.         except ValueError:
  324.             print("❌ Invalid format. Please use numbers, commas, and ranges (e.g., 1,3,5-8)\n")
  325.  
  326. # ============= Main pipeline =============
  327. def main():
  328.     log.info("Starting voc.py")
  329.     # load cache
  330.     cache = load_cache()
  331.  
  332.     # open DB
  333.     if not os.path.exists(DB_PATH):
  334.         log.error("Database not found: %s", DB_PATH)
  335.         return
  336.  
  337.     conn = sqlite3.connect(DB_PATH)
  338.     try:
  339.         df_vocab = pd.read_sql_query("SELECT * FROM vocabulary;", conn)
  340.         df_titles = pd.read_sql_query("SELECT * FROM title;", conn)
  341.     finally:
  342.         conn.close()
  343.  
  344.     # Normalize title table columns
  345.     df_titles = df_titles.rename(columns={"id": "title_id", "name": "book_title"})
  346.    
  347.     # Interactive title selection
  348.     selected_title_ids = select_titles(df_titles)
  349.     log.info("User selected %d book(s): %s", len(selected_title_ids), selected_title_ids)
  350.    
  351.     # Filter vocabulary by selected titles
  352.     df_vocab_filtered = df_vocab[df_vocab['title_id'].isin(selected_title_ids)]
  353.     log.info("Filtered to %d vocabulary rows from selected books", len(df_vocab_filtered))
  354.    
  355.     # Merge
  356.     df = df_vocab_filtered.merge(df_titles[["title_id", "book_title"]], on="title_id", how="left")
  357.     log.info("Loaded %d vocabulary rows", len(df))
  358.  
  359.     # Combine context
  360.     df["context_full"] = (
  361.         df["prev_context"].fillna("") + " " +
  362.         df["word"].fillna("") + " " +
  363.         df["next_context"].fillna("")
  364.     ).str.strip()
  365.  
  366.     # Prepare session
  367.     session = requests.Session()
  368.  
  369.     # Fetch definitions (with cache)
  370.     words = df["word"].astype(str).tolist()
  371.     log.info("Fetching definitions for %d words", len(words))
  372.     defs = []
  373.     for i, w in enumerate(words, start=1):
  374.         d = get_wiktionary_definition(w, session, cache)
  375.         defs.append(d)
  376.         if i % 50 == 0:
  377.             log.info("Progress: %d/%d", i, len(words))
  378.             save_cache(cache)  # periodic save
  379.  
  380.     df["definition"] = defs
  381.  
  382.     # Add bold formatting to word in context
  383.     log.info("Adding bold formatting to words in context")
  384.     df["context_bold"] = df.apply(
  385.         lambda row: add_bold_to_word_in_context(row["context_full"], row["word"]),
  386.         axis=1
  387.     )
  388.  
  389.     # Sheet 1: Full data with bold word in context
  390.     df_full = df[["book_title", "word", "context_bold", "definition"]]
  391.     df_full = df_full.rename(columns={"context_bold": "context"})
  392.  
  393.     # Export to Excel with multiple sheets
  394.     with pd.ExcelWriter(OUTPUT_EXCEL, engine='openpyxl') as writer:
  395.         df_full.to_excel(writer, sheet_name='Full Data', index=False)
  396.        
  397.         # Sheet 2: Quizlet format with formulas linking to Full Data
  398.         ws_quizlet = writer.book.create_sheet('Quizlet Import')
  399.        
  400.         # Write headers
  401.         ws_quizlet['A1'] = 'book_title'
  402.         ws_quizlet['B1'] = 'word'
  403.         ws_quizlet['C1'] = 'definition'
  404.        
  405.         # Write formulas for each row (starting at row 2)
  406.         for i in range(2, len(df_full) + 2):
  407.             ws_quizlet[f'A{i}'] = f"='Full Data'!A{i}"
  408.             ws_quizlet[f'B{i}'] = f"='Full Data'!B{i}"
  409.             ws_quizlet[f'C{i}'] = f"='Full Data'!D{i}"
  410.    
  411.     log.info("Wrote output to %s with 2 sheets", OUTPUT_EXCEL)
  412.     log.info("  - 'Full Data': Complete vocabulary list with bold words")
  413.     log.info("  - 'Quizlet Import': 3-column format (book_title, context, definition)")
  414.  
  415.     # save cache
  416.     save_cache(cache)
  417.     log.info("Done.")
  418.  
# Script entry point: run the pipeline only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment