- #!/usr/bin/env bun
- /**
- * PDF Recovery Script
- *
- * Recovers a corrupted base64-encoded PDF that was OCR'd with errors.
- * Techniques: text substitutions, Adler-32 checksum repair,
- * error-position-guided fuzzing with OCR confusion pairs,
- * brute-force l→1 substitution.
- *
- * Memory-efficient: modifies stream data bytes in-place instead of
- * re-decoding the entire PDF for each trial.
- *
- * Usage: bun recover_pdf.ts <input.txt> <output.pdf>
- */
- import { readFileSync, writeFileSync, existsSync, unlinkSync } from "fs";
- import { inflateSync, inflateRawSync, deflateSync, constants } from "zlib";
- import { spawnSync } from "child_process";
- // ── Utilities ───────────────────────────────────────────────────────
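- // Note: `which` assumes a Unix-like environment (macOS/Linux, matching the
- // brew hint below); on Windows the equivalent lookup would be `where`.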
- function commandExists(cmd: string): boolean {
- return spawnSync("which", [cmd], { encoding: "utf-8" }).status === 0;
- }
- function runGhostscript(inputPath: string, outputPath: string): boolean {
- if (!commandExists("gs")) {
- console.error("\nGhostscript (gs) not found. Install with: brew install ghostscript");
- return false;
- }
- if (existsSync(outputPath)) unlinkSync(outputPath);
- spawnSync("gs", [
- "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4",
- "-dNOPAUSE", "-dQUIET", "-dBATCH",
- `-sOutputFile=${outputPath}`, inputPath
- ], { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"] });
- return existsSync(outputPath);
- }
- function extractText(pdfPath: string): string {
- if (!commandExists("gs")) return "(gs not available)";
- const result = spawnSync("gs", [
- "-sDEVICE=txtwrite", "-sOutputFile=-",
- "-dQUIET", "-dNOPAUSE", "-dBATCH", pdfPath
- ], { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"] });
- return result.stdout || "";
- }
- // ── PDF Token Dictionary ─────────────────────────────────────────────
- // Known valid tokens in PDF files for auto-discovering OCR corruption.
- const PDF_TOKENS: Set<string> = new Set([
- // Dictionary keys
- "/Type", "/Subtype", "/Filter", "/Length", "/Width", "/Height",
- "/Name", "/Names", "/Pages", "/Page", "/Root", "/Info", "/Size",
- "/Resources", "/Contents", "/MediaBox", "/CropBox", "/BleedBox",
- "/TrimBox", "/ArtBox", "/Font", "/XObject", "/ExtGState",
- "/ColorSpace", "/ProcSet", "/Encoding", "/BaseFont", "/Metadata",
- "/Predictor", "/Columns", "/Colors", "/DecodeParms", "/Decode",
- "/Linearized", "/Parent", "/Kids", "/Count", "/Rotate",
- "/Annots", "/Border", "/Rect", "/Dest", "/Action", "/URI", "/S",
- "/FontDescriptor", "/FontName", "/FontFile", "/FontFile2",
- "/FontFile3", "/Flags", "/ItalicAngle", "/StemV", "/StemH",
- "/Ascent", "/Descent", "/CapHeight", "/XHeight", "/MissingWidth",
- "/FontBBox", "/FirstChar", "/LastChar", "/Widths", "/AvgWidth",
- "/MaxWidth", "/Leading", "/ToUnicode", "/CIDSystemInfo",
- "/DescendantFonts", "/BaseEncoding", "/Differences", "/DW", "/W",
- "/BitsPerComponent", "/ImageMask", "/Mask", "/SMask",
- "/Interpolate", "/Intent", "/ColorTransform",
- "/ID", "/Index", "/Prev", "/N", "/O", "/P", "/E", "/T", "/H", "/L",
- "/Producer", "/Creator", "/CreationDate", "/ModDate", "/LastModified",
- "/Title", "/Author", "/Subject", "/Keywords", "/Trapped",
- "/GTS_PDFXVersion", "/OutputIntents", "/DestOutputProfile",
- "/MarkInfo", "/StructTreeRoot", "/Lang",
- "/ViewerPreferences", "/PageLayout", "/PageMode",
- "/Outlines", "/Threads", "/OpenAction", "/AcroForm", "/Fields",
- "/Prop_Build", "/Properties", "/Group", "/K", "/Pg",
- "/BBox", "/Matrix", "/FormType", "/OC", "/OCGs", "/OCProperties",
- "/Registry", "/Ordering", "/Supplement",
- // Type values
- "/Catalog", "/FontDescriptor", "/Image", "/Form",
- "/Annot", "/Link", "/Text", "/Widget", "/ObjStm", "/XRef",
- // Filter values
- "/FlateDecode", "/DCTDecode", "/ASCII85Decode", "/ASCIIHexDecode",
- "/LZWDecode", "/RunLengthDecode", "/CCITTFaxDecode",
- "/JBIG2Decode", "/JPXDecode", "/Crypt",
- // Color spaces
- "/DeviceRGB", "/DeviceCMYK", "/DeviceGray", "/DeviceN",
- "/ICCBased", "/Indexed", "/CalRGB", "/CalGray", "/Lab",
- "/Pattern", "/Separation",
- // ProcSet values
- "/PDF", "/ImageB", "/ImageC", "/ImageI",
- // XMP namespaces (bare, no leading /)
- "xmpG", "xmpMM", "xmpTPg", "stRef", "stFnt", "stEvt",
- "pdfx", "pdfaid",
- // Common parameter values (bare)
- "FlateDecode", "DCTDecode", "CMYK", "Name", "ProcSet",
- "Predictor", "mode", "LastModified",
- ]);
- // ── OCR Confusion Model ─────────────────────────────────────────────
- // Maps characters to what OCR commonly misreads them as (bidirectional).
- const OCR_CONFUSIONS: Map<string, string[]> = new Map([
- [".", ["/"]],
- ["/", ["."]],
- ["l", ["1", "I"]],
- ["1", ["l", "I"]],
- ["I", ["l", "1", "M"]],
- ["O", ["0"]],
- ["0", ["O"]],
- ["m", ["i"]],
- ["i", ["m"]],
- ["M", ["I"]],
- ["c", ["b"]],
- ["b", ["c"]],
- ["5", ["S", "%"]],
- ["S", ["5"]],
- ["%", ["5"]],
- ["8", ["B"]],
- ["B", ["8"]],
- ["e", ["u"]],
- ["u", ["e"]],
- ]);
- // ── Auto-discovery of OCR errors ────────────────────────────────────
- interface DiscoveredFix {
- offset: number;
- original: string;
- replacement: string;
- rule: string;
- }
- /** Find plaintext regions (everything outside stream...endstream) */
- function findPlaintextRegions(pdfStr: string): { start: number; end: number }[] {
- const regions: { start: number; end: number }[] = [];
- const streamRe = /stream[\r\n][\s\S]*?endstream/g;
- let lastEnd = 0;
- let m;
- while ((m = streamRe.exec(pdfStr)) !== null) {
- if (m.index > lastEnd) regions.push({ start: lastEnd, end: m.index });
- lastEnd = m.index + m[0].length;
- }
- if (lastEnd < pdfStr.length) regions.push({ start: lastEnd, end: pdfStr.length });
- return regions;
- }
- /** Try single-char OCR confusions to match a token against the dictionary */
- function tryOCRConfusions(token: string): string | null {
- // 1-char substitutions
- for (let i = 0; i < token.length; i++) {
- const confusions = OCR_CONFUSIONS.get(token[i]);
- if (!confusions) continue;
- for (const sub of confusions) {
- if (sub.length !== 1) continue;
- const candidate = token.substring(0, i) + sub + token.substring(i + 1);
- if (PDF_TOKENS.has(candidate)) return candidate;
- }
- }
- // 2-char substitutions for longer tokens
- if (token.length > 4) {
- for (let i = 0; i < token.length; i++) {
- const c1 = OCR_CONFUSIONS.get(token[i]);
- if (!c1) continue;
- for (const s1 of c1) {
- if (s1.length !== 1) continue;
- const partial = token.substring(0, i) + s1 + token.substring(i + 1);
- for (let j = i + 1; j < partial.length; j++) {
- const c2 = OCR_CONFUSIONS.get(partial[j]);
- if (!c2) continue;
- for (const s2 of c2) {
- if (s2.length !== 1) continue;
- const candidate = partial.substring(0, j) + s2 + partial.substring(j + 1);
- if (PDF_TOKENS.has(candidate)) return candidate;
- }
- }
- }
- }
- }
- return null;
- }
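- // Worked example: tryOCRConfusions("/F1ateDecode") substitutes '1' -> 'l'
- // at index 2, producing "/FlateDecode", which is in PDF_TOKENS.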
- /**
- * Auto-discover and fix OCR errors in the decoded PDF using PDF spec knowledge.
- * Replaces hardcoded BASE64_SUBSTITUTIONS and DECODED_FIXES.
- */
- function autoDiscoverAndFixOCRErrors(base64Clean: string): {
- base64Buf: Buffer; pdf: Buffer; pdfStr: string; fixCount: number;
- } {
- let pdf = Buffer.from(base64Clean, 'base64');
- let pdfStr = pdf.toString('binary');
- const regions = findPlaintextRegions(pdfStr);
- const fixes: DiscoveredFix[] = [];
- const seen = new Set<number>();
- for (const region of regions) {
- const text = pdfStr.substring(region.start, region.end);
- // Pattern 1: PDF names — /Word or .Word (the dot may be a corrupted slash)
- const nameRe = /[./][A-Z][A-Za-z0-9_]*/g;
- let m;
- while ((m = nameRe.exec(text)) !== null) {
- const token = m[0];
- const absOff = region.start + m.index;
- if (seen.has(absOff)) continue;
- if (token.startsWith(".")) {
- // Try . → / directly
- const slashed = "/" + token.substring(1);
- if (PDF_TOKENS.has(slashed)) {
- fixes.push({ offset: absOff, original: token, replacement: slashed,
- rule: `. -> / (${slashed})` });
- seen.add(absOff);
- continue;
- }
- // Try . → / plus further OCR fix
- const further = tryOCRConfusions(slashed);
- if (further) {
- fixes.push({ offset: absOff, original: token, replacement: further,
- rule: `. -> / + OCR fix (${further})` });
- seen.add(absOff);
- continue;
- }
- }
- if (token.startsWith("/") && !PDF_TOKENS.has(token)) {
- const fixed = tryOCRConfusions(token);
- if (fixed) {
- fixes.push({ offset: absOff, original: token, replacement: fixed,
- rule: `OCR fix (${fixed})` });
- seen.add(absOff);
- }
- }
- }
- // Pattern 2: Bare words (XMP namespaces, parameter values)
- const wordRe = /(?<![/.\w])[a-zA-Z][a-zA-Z]{2,20}(?![a-zA-Z])/g;
- while ((m = wordRe.exec(text)) !== null) {
- const token = m[0];
- const absOff = region.start + m.index;
- if (seen.has(absOff) || PDF_TOKENS.has(token)) continue;
- const fixed = tryOCRConfusions(token);
- if (fixed) {
- fixes.push({ offset: absOff, original: token, replacement: fixed,
- rule: `bare word OCR fix (${fixed})` });
- seen.add(absOff);
- }
- }
- // Pattern 3: Structural . → / after >> or ] or digits
- const structRe = /(>>|]|\d)\.([\s\r\n/A-Z])/g;
- while ((m = structRe.exec(text)) !== null) {
- const dotOff = region.start + m.index + m[1].length;
- if (seen.has(dotOff)) continue;
- fixes.push({ offset: dotOff, original: ".", replacement: "/",
- rule: `. -> / after "${m[1]}"` });
- seen.add(dotOff);
- }
- // Pattern 4: % in hex context (likely misread 5)
- const hexRe = /%([0-9A-Fa-f]{4,})/g;
- while ((m = hexRe.exec(text)) !== null) {
- const absOff = region.start + m.index;
- if (seen.has(absOff)) continue;
- fixes.push({ offset: absOff, original: "%", replacement: "5",
- rule: `% -> 5 in hex context` });
- seen.add(absOff);
- }
- }
- // Apply fixes (reverse offset order to preserve positions)
- fixes.sort((a, b) => b.offset - a.offset);
- for (const fix of fixes) {
- pdfStr = pdfStr.substring(0, fix.offset) + fix.replacement +
- pdfStr.substring(fix.offset + fix.original.length);
- }
- // Log discoveries (sorted by offset for readability)
- const sorted = [...fixes].sort((a, b) => a.offset - b.offset);
- for (const fix of sorted) {
- console.log(` "${fix.original}" -> "${fix.replacement}" [${fix.rule}]`);
- }
- if (fixes.length > 0) {
- pdf = Buffer.from(pdfStr, 'binary');
- }
- const base64Buf = Buffer.from(pdf.toString('base64'));
- return { base64Buf, pdf, pdfStr, fixCount: fixes.length };
- }
- // ── Base64 helpers ──────────────────────────────────────────────────
- const B64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
- const B64_VAL = new Uint8Array(128);
- for (let i = 0; i < 64; i++) B64_VAL[B64_CHARS.charCodeAt(i)] = i;
- const L_BYTE = 'l'.charCodeAt(0);
- const ONE_BYTE = '1'.charCodeAt(0);
- function pdfToB64(pdfPos: number): number {
- return Math.floor(pdfPos / 3) * 4;
- }
- /** Decode one base64 group (4 chars → 3 bytes) */
- function decodeGroup(b64Buf: Buffer, groupStart: number): [number, number, number] {
- const a = B64_VAL[b64Buf[groupStart]];
- const b = B64_VAL[b64Buf[groupStart + 1]];
- const c = B64_VAL[b64Buf[groupStart + 2]];
- const d = B64_VAL[b64Buf[groupStart + 3]];
- return [(a << 2) | (b >> 4), ((b & 0xF) << 4) | (c >> 2), ((c & 3) << 6) | d];
- }
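- // Worked example (standard base64): "TWFu" decodes to "Man".
- //   T=19, W=22, F=5, u=46 -> bytes 77 'M', 97 'a', 110 'n'.
- // pdfToB64 maps decoded 3-byte groups back to 4-char base64 groups,
- // e.g. PDF byte 7 is in group floor(7/3)=2, whose base64 chars start at 8.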
- /**
- * Apply a base64 char change to a stream data buffer IN PLACE.
- * Returns the old bytes so the change can be reverted.
- * Only modifies bytes that fall within the stream data range.
- */
- function applyB64Change(
- b64Buf: Buffer, streamData: Buffer, streamStart: number,
- b64Pos: number, newChar: number
- ): { oldBytes: [number, number, number]; changed: boolean } {
- const orig = b64Buf[b64Pos];
- const groupStart = Math.floor(b64Pos / 4) * 4;
- const pdfByteStart = (groupStart / 4) * 3;
- // Get old decoded bytes for this group
- const old = decodeGroup(b64Buf, groupStart); // fresh tuple per call, no copy needed
- // Temporarily apply change and decode
- b64Buf[b64Pos] = newChar;
- const [n0, n1, n2] = decodeGroup(b64Buf, groupStart);
- b64Buf[b64Pos] = orig;
- let changed = false;
- const offsets = [pdfByteStart - streamStart, pdfByteStart - streamStart + 1, pdfByteStart - streamStart + 2];
- const newBytes = [n0, n1, n2];
- for (let k = 0; k < 3; k++) {
- if (offsets[k] >= 0 && offsets[k] < streamData.length && newBytes[k] !== streamData[offsets[k]]) {
- streamData[offsets[k]] = newBytes[k];
- changed = true;
- }
- }
- // Return the old values for the 3 positions (for reverting)
- return { oldBytes: old, changed };
- }
- /** Revert a base64 change on stream data */
- function revertB64Change(
- streamData: Buffer, streamStart: number,
- b64Pos: number, oldBytes: [number, number, number]
- ): void {
- const groupStart = Math.floor(b64Pos / 4) * 4;
- const pdfByteStart = (groupStart / 4) * 3;
- for (let k = 0; k < 3; k++) {
- const off = pdfByteStart - streamStart + k;
- if (off >= 0 && off < streamData.length) {
- streamData[off] = oldBytes[k];
- }
- }
- }
- // ── OCR confusion pairs ─────────────────────────────────────────────
- const CONFUSION_PAIRS: [number, number][] = [
- [0x6C, 0x31], [0x31, 0x6C], // l ↔ 1
- [0x4F, 0x30], [0x30, 0x4F], // O ↔ 0
- [0x49, 0x6C], [0x6C, 0x49], // I ↔ l
- [0x49, 0x31], [0x31, 0x49], // I ↔ 1
- [0x35, 0x53], [0x53, 0x35], // 5 ↔ S
- [0x38, 0x42], [0x42, 0x38], // 8 ↔ B
- ];
- const SUBS_MAP = new Map<number, number[]>();
- for (const [from, to] of CONFUSION_PAIRS) {
- if (!SUBS_MAP.has(from)) SUBS_MAP.set(from, []);
- const arr = SUBS_MAP.get(from)!;
- if (!arr.includes(to)) arr.push(to);
- }
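- // e.g. SUBS_MAP.get(0x6C) is [0x31, 0x49]: 'l' may be tried as '1' or 'I'.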
- // ── Types ───────────────────────────────────────────────────────────
- interface StreamInfo {
- obj: number;
- start: number;
- end: number;
- size: number;
- hasFilter: boolean;
- }
- // ── Stream detection ────────────────────────────────────────────────
- function findAllStreams(pdfStr: string): StreamInfo[] {
- const streams: StreamInfo[] = [];
- const re = />>\s*stream[\r\n]+/g;
- let m;
- while ((m = re.exec(pdfStr)) !== null) {
- const dataStart = m.index + m[0].length;
- const lookback = pdfStr.substring(Math.max(0, m.index - 2000), m.index);
- const lastObj = lookback.lastIndexOf(' 0 obj');
- if (lastObj === -1) continue;
- const lastEndobj = lookback.lastIndexOf('endobj');
- if (lastEndobj > lastObj) continue;
- const beforeObj = lookback.substring(Math.max(0, lastObj - 10), lastObj);
- const numMatch = beforeObj.match(/(\d+)\s*$/);
- if (!numMatch) continue;
- const objNum = parseInt(numMatch[1]);
- const dictText = lookback.substring(lastObj);
- const end = pdfStr.indexOf('endstream', dataStart);
- if (end === -1) continue;
- const hasFilter = /FlateDecode/.test(dictText);
- streams.push({ obj: objNum, start: dataStart, end, size: end - dataStart, hasFilter });
- }
- return streams;
- }
- function findObjStream(pdfStr: string, objNum: number): StreamInfo | null {
- const pat = new RegExp('(?:^|[\\r\\n])' + objNum + '\\s+0\\s+obj', 'g');
- let om;
- while ((om = pat.exec(pdfStr)) !== null) {
- const objStart = om.index;
- const after = pdfStr.substring(objStart, objStart + 50000);
- const sm = after.match(/>>\s*stream[\r\n]+/);
- if (!sm) continue;
- const between = after.substring(0, sm.index! + sm[0].length);
- if (/endobj/.test(between)) continue;
- const start = objStart + sm.index! + sm[0].length;
- const end = pdfStr.indexOf('endstream', start);
- if (end === -1) continue;
- const hasFilter = /FlateDecode/.test(between);
- return { obj: objNum, start, end, size: end - start, hasFilter };
- }
- return null;
- }
- function findPageContentObjects(pdfStr: string): number[] {
- const contents: number[] = [];
- const re = /\/Type\s*\/Page\b(?!s)([\s\S]*?)(?=endobj)/g;
- let pm;
- while ((pm = re.exec(pdfStr)) !== null) {
- const chunk = pm[0]; // pm[1] is the tail of pm[0]; concatenating would duplicate it
- const single = chunk.match(/\/Contents\s+(\d+)\s+0\s+R/);
- if (single) contents.push(parseInt(single[1]));
- const arr = chunk.match(/\/Contents\s*\[([\d\s\nR]+)\]/);
- if (arr) {
- for (const ref of arr[1].matchAll(/(\d+)\s+0\s+R/g)) {
- contents.push(parseInt(ref[1]));
- }
- }
- }
- return [...new Set(contents)];
- }
- // ── Decompression helpers ───────────────────────────────────────────
- function canInflate(data: Buffer): boolean {
- try { inflateSync(data); return true; } catch { return false; }
- }
- function getError(data: Buffer): string {
- try { inflateSync(data); return "OK"; }
- catch (e: any) { return String(e.message || e).replace(/^Error.*?:\s*/, ''); }
- }
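- // Binary-search for the approximate byte offset where strict raw inflation
- // first hits a hard error. Prefixes that merely end early count as "still OK"
- // (Z_SYNC_FLUSH tolerates truncation), so `lo` converges on the longest
- // prefix that decodes cleanly; the +2 converts back to stream-data
- // coordinates, past the 2-byte zlib header.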
- function findCorruptionOffset(streamData: Buffer): number {
- if (streamData.length < 3) return 0;
- const rawData = streamData.subarray(2);
- let lo = 0, hi = rawData.length;
- while (lo < hi - 1) {
- const mid = Math.floor((lo + hi) / 2);
- try {
- inflateRawSync(rawData.subarray(0, mid), { finishFlush: constants.Z_SYNC_FLUSH });
- lo = mid;
- } catch (e: any) {
- const msg = String(e.message || '');
- if (msg.includes('unexpected end') || msg.includes('buffer error') ||
- msg.includes('incomplete') || msg.includes('need dictionary')) {
- lo = mid;
- } else { hi = mid; }
- }
- }
- return lo + 2;
- }
- // ── Adler-32 checksum repair ────────────────────────────────────────
- function computeAdler32(data: Buffer): number {
- let a = 1, b = 0;
- const MOD = 65521;
- for (let i = 0; i < data.length; i++) {
- a = (a + data[i]) % MOD;
- b = (b + a) % MOD;
- }
- return ((b << 16) | a) >>> 0;
- }
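- // Worked example: computeAdler32(Buffer.from("abc")) === 0x024D0127
- //   (a: 1->98->196->295; b: 98->294->589; (589 << 16) | 295).
- // zlib stream layout (RFC 1950): [2-byte header][deflate data][4-byte
- // big-endian Adler-32 of the *uncompressed* data]; this is why the repair
- // functions below skip 2 bytes and patch the trailing 4.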
- function tryFixChecksum(pdf: Buffer, stream: StreamInfo): boolean {
- if (!stream.hasFilter || stream.size < 6) return false;
- // Strategy 1: Patch near the end (works when zlib data fills the stream)
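- // Trimming 2/1/0 bytes tolerates a possible trailing \r\n before 'endstream'.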
- for (const trim of [2, 1, 0]) {
- const actualEnd = stream.end - trim;
- const streamData = pdf.subarray(stream.start, actualEnd);
- if (streamData.length < 6) continue;
- let decompressed: Buffer;
- try { decompressed = inflateRawSync(streamData.subarray(2)); } catch { continue; }
- const adler = computeAdler32(decompressed);
- const pos = actualEnd - 4;
- const old = [pdf[pos], pdf[pos+1], pdf[pos+2], pdf[pos+3]];
- pdf[pos] = (adler >>> 24) & 0xFF;
- pdf[pos + 1] = (adler >>> 16) & 0xFF;
- pdf[pos + 2] = (adler >>> 8) & 0xFF;
- pdf[pos + 3] = adler & 0xFF;
- if (canInflate(pdf.subarray(stream.start, stream.end))) return true;
- // Revert
- pdf[pos] = old[0]; pdf[pos+1] = old[1]; pdf[pos+2] = old[2]; pdf[pos+3] = old[3];
- }
- // Strategy 2: Find actual deflate data end (handles trailing data after checksum)
- const raw = pdf.subarray(stream.start + 2, stream.end);
- let decompressed: Buffer;
- try { decompressed = inflateRawSync(raw); } catch { return false; }
- // Binary search for minimum raw bytes needed (= deflate data length)
- let lo = 1, hi = raw.length;
- while (lo + 1 < hi) {
- const mid = Math.floor((lo + hi) / 2);
- try { inflateRawSync(raw.subarray(0, mid)); hi = mid; } catch { lo = mid; }
- }
- const deflateLen = hi;
- const adler = computeAdler32(decompressed);
- const checksumPos = stream.start + 2 + deflateLen;
- if (checksumPos + 4 > stream.end) return false;
- const old = [pdf[checksumPos], pdf[checksumPos+1], pdf[checksumPos+2], pdf[checksumPos+3]];
- pdf[checksumPos] = (adler >>> 24) & 0xFF;
- pdf[checksumPos + 1] = (adler >>> 16) & 0xFF;
- pdf[checksumPos + 2] = (adler >>> 8) & 0xFF;
- pdf[checksumPos + 3] = adler & 0xFF;
- if (canInflate(pdf.subarray(stream.start, stream.end))) return true;
- // Revert
- pdf[checksumPos] = old[0]; pdf[checksumPos+1] = old[1]; pdf[checksumPos+2] = old[2]; pdf[checksumPos+3] = old[3];
- return false;
- }
- /**
- * Fix streams where BFINAL bit is missing/corrupted.
- * Detects via: strict inflateRawSync fails with "unexpected end of file"
- * but Z_SYNC_FLUSH mode succeeds. Then brute-forces 1-2 byte changes
- * near the stream end to restore the BFINAL marker, followed by
- * Adler-32 checksum repair.
- */
- function tryFixStreamEnd(pdf: Buffer, stream: StreamInfo): boolean {
- if (!stream.hasFilter || stream.size < 6) return false;
- const rawOrig = pdf.subarray(stream.start + 2, stream.end);
- // Only attempt if strict inflate fails with "unexpected end"
- try { inflateRawSync(rawOrig); return false; } catch (e: any) {
- if (!String(e.message || '').includes('unexpected end')) return false;
- }
- // Verify Z_SYNC_FLUSH works (data valid but BFINAL missing)
- try { inflateRawSync(rawOrig, { finishFlush: constants.Z_SYNC_FLUSH }); }
- catch { return false; }
- const raw = Buffer.from(rawOrig); // work on a copy
- const searchRange = Math.min(50, raw.length);
- const startIdx = Math.max(0, raw.length - searchRange);
- for (let i = startIdx; i < raw.length; i++) {
- const oi = raw[i];
- for (let v = 0; v < 256; v++) {
- if (v === oi) continue;
- raw[i] = v;
- let ok = false;
- try { inflateRawSync(raw); ok = true; } catch (e: any) {
- if (!String(e.message || '').includes('unexpected end')) {
- // Error shifted — search ±5 for second byte fix
- for (let j = Math.max(0, i - 5); j <= Math.min(raw.length - 1, i + 5); j++) {
- if (j === i) continue;
- const oj = raw[j];
- for (let w = 0; w < 256; w++) {
- if (w === oj) continue;
- raw[j] = w;
- try { inflateRawSync(raw); ok = true; } catch {}
- if (ok) {
- // Apply both changes to pdf, then fix checksum
- pdf[stream.start + 2 + i] = v;
- pdf[stream.start + 2 + j] = w;
- if (tryFixChecksum(pdf, stream)) return true;
- // Revert if checksum repair failed
- pdf[stream.start + 2 + i] = oi;
- pdf[stream.start + 2 + j] = oj;
- ok = false;
- }
- }
- raw[j] = oj;
- }
- }
- }
- if (ok) {
- // Single-byte fix succeeded
- pdf[stream.start + 2 + i] = v;
- if (tryFixChecksum(pdf, stream)) return true;
- pdf[stream.start + 2 + i] = oi; // revert
- }
- raw[i] = oi;
- }
- }
- return false;
- }
- // ── Visual repair: ObjStm alignment, color space, content truncation ─
- /**
- * Fix ObjStm /First header alignment. If the header byte count doesn't
- * match the /First value in the dictionary, pad with spaces so GS can
- * find embedded objects.
- */
- function fixObjStmAlignment(pdf: Buffer, pdfStr: string): string {
- const re = />>\s*stream[\r\n]+/g;
- let m;
- while ((m = re.exec(pdfStr)) !== null) {
- const start = m.index + m[0].length;
- const lookback = pdfStr.substring(Math.max(0, m.index - 500), m.index);
- const lastObj = lookback.lastIndexOf(' 0 obj');
- if (lastObj === -1) continue;
- const dictText = lookback.substring(lastObj + 6);
- if (!/\/ObjStm/.test(dictText)) continue;
- const end = pdfStr.indexOf('endstream', start);
- if (end === -1) continue;
- const data = pdf.subarray(start, end);
- let dec: Buffer;
- try { dec = inflateSync(data); } catch { continue; }
- const text = dec.toString('utf-8');
- const firstMatch = dictText.match(/\/First\s+(\d+)/);
- if (!firstMatch) continue;
- const firstValue = parseInt(firstMatch[1]);
- const headerMatch = text.match(/^([\d\s]+)/);
- if (!headerMatch) continue;
- const currentLen = headerMatch[0].length;
- if (currentLen === firstValue) continue;
- // Pad header to match /First
- const nums = headerMatch[1].trim().split(/\s+/).map(Number);
- let newHeader = '';
- for (let i = 0; i < nums.length; i += 2) {
- newHeader += nums[i] + ' ' + nums[i + 1] + ' ';
- }
- while (newHeader.length < firstValue) newHeader += ' ';
- const objData = text.substring(currentLen);
- const newContent = newHeader + objData;
- const compressed = deflateSync(Buffer.from(newContent));
- if (compressed.length > end - start) continue;
- compressed.copy(pdf, start);
- for (let i = start + compressed.length; i < end; i++) pdf[i] = 0;
- updateStreamLength(pdf, pdfStr, start, compressed.length);
- pdfStr = pdf.toString('binary');
- }
- return pdfStr;
- }
- /**
- * Auto-detect Separation color space → CMYK mappings from the PDF's
- * color space definition objects, then replace /CSn cs T scn operators
- * in content streams with direct CMYK k operators.
- *
- * For FunctionType 2 (exponential interpolation):
- * CMYK = C0 + tint^N * (C1 - C0)
- */
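- // Worked example (values match the csMap example below): a Separation space
- // with C0=[0 0 0 0], C1=[1 0.57 0 0.38] at tint=1 yields CMYK
- // 1 0.57 0 0.38, so "/CS0 cs 1 scn" is rewritten to "1 0.57 0 0.38 k".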
- function replaceColorSpacesWithCMYK(pdf: Buffer, pdfStr: string): string {
- // Step 1: Find /ColorSpace<<...>> in page Resource dictionaries
- // Maps CS name (e.g. "CS0") → object number
- const csRefs = new Map<string, number>();
- const csRefRe = /\/ColorSpace\s*<<([^>]+)>>/g;
- let csm;
- while ((csm = csRefRe.exec(pdfStr)) !== null) {
- const entries = csm[1].matchAll(/\/(CS\d+)\s+(\d+)\s+0\s+R/g);
- for (const e of entries) csRefs.set(e[1], parseInt(e[2]));
- }
- if (csRefs.size === 0) return pdfStr;
- // Step 2: Resolve each CS object to get C0, C1, N from the Separation definition
- // csMap: "CS0" → "1 0.57 0 0.38" (CMYK string for tint=1)
- const csMap = new Map<string, string>();
- for (const [csName, objNum] of csRefs) {
- const csContent = resolveObject(pdf, pdfStr, objNum);
- if (!csContent) continue;
- // Parse: [/Separation /name /DeviceCMYK << /C0[...] /C1[...] /N n ... >>]
- const c0Match = csContent.match(/\/C0\s*\[([^\]]+)\]/);
- const c1Match = csContent.match(/\/C1\s*\[([^\]]+)\]/);
- const nMatch = csContent.match(/\/N\s+([\d.]+)/);
- if (!c0Match || !c1Match) continue;
- const c0 = c0Match[1].trim().split(/\s+/).map(Number);
- const c1 = c1Match[1].trim().split(/\s+/).map(Number);
- const n = nMatch ? parseFloat(nMatch[1]) : 1;
- if (c0.length !== 4 || c1.length !== 4) continue;
- if (c0.some(isNaN) || c1.some(isNaN)) continue;
- // Compute CMYK at tint=1: C0 + 1^N * (C1 - C0) = C1
- const cmyk = c1.map((v, i) => {
- const val = c0[i] + Math.pow(1, n) * (v - c0[i]);
- return Number(val.toFixed(4)).toString();
- });
- csMap.set(csName, cmyk.join(' '));
- console.log(` Color space /${csName} (obj ${objNum}): → CMYK(${cmyk.join(', ')})`);
- }
- if (csMap.size === 0) return pdfStr;
- // Step 3: Replace /CSn cs T scn → CMYK k in content streams
- const streamRe = />>\s*stream[\r\n]+/g;
- let m;
- while ((m = streamRe.exec(pdfStr)) !== null) {
- const start = m.index + m[0].length;
- const lookback = pdfStr.substring(Math.max(0, m.index - 300), m.index);
- const lastObj = lookback.lastIndexOf(' 0 obj');
- if (lastObj === -1) continue;
- const beforeObj = lookback.substring(Math.max(0, lastObj - 10), lastObj);
- const numMatch = beforeObj.match(/(\d+)\s*$/);
- if (!numMatch) continue;
- const end = pdfStr.indexOf('endstream', start);
- if (end === -1) continue;
- const data = pdf.subarray(start, end);
- let dec: Buffer;
- try { dec = inflateSync(data); } catch { continue; }
- let text = dec.toString('utf-8');
- const orig = text;
- for (const [csName, cmyk] of csMap) {
- // Match /<CSname> cs <tint> scn (with flexible whitespace)
- const pat = new RegExp(`\\/${csName}\\s+cs\\s+([\\d.]+)\\s+scn`, 'g');
- text = text.replace(pat, (_: string, tint: string) => {
- // For tint != 1 we would need to recompute; tint=1 is by far the most common case
- if (parseFloat(tint) === 1) return `${cmyk} k`;
- // For other tint values, leave unchanged (rare)
- return _;
- });
- }
- if (text === orig) continue;
- const compressed = deflateSync(Buffer.from(text));
- if (compressed.length > end - start) continue;
- compressed.copy(pdf, start);
- for (let i = start + compressed.length; i < end; i++) pdf[i] = 0;
- updateStreamLength(pdf, pdfStr, start, compressed.length);
- pdfStr = pdf.toString('binary');
- }
- return pdfStr;
- }
- /**
- * Resolve an object by number — check ObjStm streams first, then standalone objects.
- * Returns the object's text content, or null if not found/unreadable.
- */
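- // ObjStm layout (PDF 1.5+): the stream begins with N integer pairs
- // "objNum offset", followed by the object bodies; each offset is relative
- // to the /First byte. E.g. (hypothetical numbers) a header "12 0 13 57"
- // with /First 16 puts object 12 at byte 16 and object 13 at byte 73.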
- function resolveObject(pdf: Buffer, pdfStr: string, objNum: number): string | null {
- // Try standalone object first
- const standaloneRe = new RegExp(`(?:^|[\\r\\n])${objNum}\\s+0\\s+obj\\b`);
- const sm = standaloneRe.exec(pdfStr);
- if (sm) {
- const objStart = sm.index + sm[0].length;
- const objEnd = pdfStr.indexOf('endobj', objStart);
- if (objEnd !== -1) return pdfStr.substring(objStart, objEnd).trim();
- }
- // Search ObjStm streams
- const re = />>\s*stream[\r\n]+/g;
- let m;
- while ((m = re.exec(pdfStr)) !== null) {
- const start = m.index + m[0].length;
- const lookback = pdfStr.substring(Math.max(0, m.index - 500), m.index);
- const lastObj = lookback.lastIndexOf(' 0 obj');
- if (lastObj === -1) continue;
- const dictText = lookback.substring(lastObj + 6);
- if (!/\/ObjStm/.test(dictText)) continue;
- const firstMatch = dictText.match(/\/First\s+(\d+)/);
- if (!firstMatch) continue;
- const first = parseInt(firstMatch[1]);
- const end = pdfStr.indexOf('endstream', start);
- if (end === -1) continue;
- const data = pdf.subarray(start, end);
- let dec: Buffer;
- try { dec = inflateSync(data); } catch { continue; }
- const text = dec.toString('utf-8');
- const headerMatch = text.match(/^([\d\s]+)/);
- if (!headerMatch) continue;
- const nums = headerMatch[1].trim().split(/\s+/).map(Number);
- for (let i = 0; i < nums.length; i += 2) {
- if (nums[i] !== objNum) continue;
- const offset = nums[i + 1];
- const nextOffset = i + 2 < nums.length ? nums[i + 3] : text.length - first;
- return text.substring(first + offset, first + nextOffset).trim();
- }
- }
- return null;
- }
- /**
- * Detect and truncate garbled content in decompressed page content streams.
- * OCR errors can produce wrong deflate output that inflates OK but contains
- * invalid PDF operators, causing accidental fills/clips.
- */
- function truncateGarbledContentStreams(pdf: Buffer, pdfStr: string): string {
- const re = />>\s*stream[\r\n]+/g;
- let m;
- while ((m = re.exec(pdfStr)) !== null) {
- const start = m.index + m[0].length;
- const lookback = pdfStr.substring(Math.max(0, m.index - 300), m.index);
- const lastObj = lookback.lastIndexOf(' 0 obj');
- if (lastObj === -1) continue;
- const dictText = lookback.substring(lastObj + 6);
- // Only process content streams (FlateDecode, not ObjStm/XRef)
- if (!/FlateDecode/.test(dictText)) continue;
- if (/\/ObjStm|\/XRef/.test(dictText)) continue;
- const end = pdfStr.indexOf('endstream', start);
- if (end === -1) continue;
- const data = pdf.subarray(start, end);
- let dec: Buffer;
- try { dec = inflateSync(data); } catch { continue; }
- const text = dec.toString('utf-8');
- const lines = text.split('\n');
- // Find first garbled line
- let firstBad = -1;
- for (let i = 0; i < lines.length; i++) {
- const line = lines[i].trim();
- if (line.length === 0) continue;
- if (isGarbledLine(line)) { firstBad = i; break; }
- }
- if (firstBad === -1) continue;
- // Track BT/q state in good portion
- let btDepth = 0, qDepth = 0;
- for (let i = 0; i < firstBad; i++) {
- const line = lines[i].trim();
- const bt = line.match(/\bBT\b/g), et = line.match(/\bET\b/g);
- const q = line.match(/\bq\b/g), Q = line.match(/\bQ\b/g);
- if (bt) btDepth += bt.length;
- if (et) btDepth -= et.length;
- if (q) qDepth += q.length;
- if (Q) qDepth -= Q.length;
- }
- let truncated = lines.slice(0, firstBad).join('\n') + '\n';
- if (btDepth > 0) truncated += 'ET\n';
- while (qDepth > 0) { truncated += 'Q\n'; qDepth--; }
- const compressed = deflateSync(Buffer.from(truncated));
- if (compressed.length > end - start) continue;
- compressed.copy(pdf, start);
- for (let i = start + compressed.length; i < end; i++) pdf[i] = 0;
- updateStreamLength(pdf, pdfStr, start, compressed.length);
- pdfStr = pdf.toString('binary');
- }
- return pdfStr;
- }
- function isGarbledLine(line: string): boolean {
- // Non-printable chars outside parenthesized strings
- let inStr = false, depth = 0, bad = 0;
- for (let j = 0; j < line.length; j++) {
- const c = line.charCodeAt(j);
- if (line[j] === '(' && !inStr) { inStr = true; depth++; }
- else if (line[j] === '(' && inStr) depth++;
- else if (line[j] === ')' && inStr) { depth--; if (depth === 0) inStr = false; }
- if (!inStr && (c < 32 || c > 126) && c !== 9) bad++;
- }
- if (bad > 0) return true;
- // Operators merged with numbers: Q703, -26547Q
- if (/[a-zA-Z]{2}\d{3,}[a-zA-Z]/.test(line)) return true;
- if (/[QqfsSBWnhmc]\d{3,}/.test(line) && !/scn/.test(line)) return true;
- if (/\d{3,}[QqfhBWnm](?:\s|$)/.test(line)) return true;
- // Numbers with impossible dots: 0.5.148
- if (/\d+\.\d+\.\d+/.test(line) && !/Tm/.test(line) && !line.includes('(')) return true;
- // Very long numbers (garbled)
- if (/\d{6,}/.test(line) && !/Tm|cm/.test(line)) return true;
- return false;
- }
- /** Update /Length in stream dictionary after recompressing */
- function updateStreamLength(pdf: Buffer, pdfStr: string, streamStart: number, newLen: number): void {
- // Search backward for the ">>" that ends the dictionary, then find /Length
- // within the dictionary only (not in preceding stream data).
- const area = pdfStr.substring(Math.max(0, streamStart - 500), streamStart);
- // Find the last ">>" before "stream" — that's the dictionary end
- const dictEnd = area.lastIndexOf('>>');
- if (dictEnd === -1) return;
- // Find the start of this object's dictionary (look for "obj")
- const objStart = area.lastIndexOf(' 0 obj');
- const dictStart = objStart >= 0 ? objStart : 0;
- const dictOnly = area.substring(dictStart, dictEnd + 2);
- const lengthMatch = dictOnly.match(/\/Length\s+(\d+)/);
- if (!lengthMatch) return;
- const oldLen = lengthMatch[1];
- const padded = String(newLen).padStart(oldLen.length, ' ');
- if (padded.length !== oldLen.length) return; // new length has more digits; can't patch in place
- // Locate the digits via the matched text itself; re-searching with a
- // hard-coded single space would miss /Length written with other whitespace.
- const matchIdx = dictOnly.lastIndexOf(lengthMatch[0]);
- const offset = Math.max(0, streamStart - 500) + dictStart + matchIdx + lengthMatch[0].length - oldLen.length;
- for (let i = 0; i < oldLen.length; i++) pdf[offset + i] = padded.charCodeAt(i);
- }
- // ── Memory-efficient brute-force l→1 ────────────────────────────────
- /**
- * Try l→1 substitutions using direct stream data manipulation.
- * No full PDF re-decode per trial — only modifies affected bytes in
- * a pre-extracted stream data buffer, then tries inflate.
- */
- function bruteForceL1(
- b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
- maxDepth: number = 4,
- ): { fixed: boolean; changes: number; depth: number } {
- const b64Start = pdfToB64(stream.start);
- const b64End = Math.min(b64Buf.length, pdfToB64(stream.end) + 4);
- const lPos: number[] = [];
- for (let i = b64Start; i < b64End; i++) {
- if (b64Buf[i] === L_BYTE) lPos.push(i);
- }
- // Depth 1
- for (const p of lPos) {
- const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, p, ONE_BYTE);
- if (changed && canInflate(streamData)) {
- b64Buf[p] = ONE_BYTE;
- return { fixed: true, changes: 1, depth: 1 };
- }
- revertB64Change(streamData, stream.start, p, oldBytes);
- }
- if (maxDepth < 2) return { fixed: false, changes: 0, depth: 0 };
- // Depth 2
- for (let i = 0; i < lPos.length; i++) {
- const r1 = applyB64Change(b64Buf, streamData, stream.start, lPos[i], ONE_BYTE);
- b64Buf[lPos[i]] = ONE_BYTE;
- for (let j = i + 1; j < lPos.length; j++) {
- const r2 = applyB64Change(b64Buf, streamData, stream.start, lPos[j], ONE_BYTE);
- if (r2.changed && canInflate(streamData)) {
- b64Buf[lPos[j]] = ONE_BYTE;
- return { fixed: true, changes: 2, depth: 2 };
- }
- revertB64Change(streamData, stream.start, lPos[j], r2.oldBytes);
- }
- b64Buf[lPos[i]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[i], r1.oldBytes);
- }
- if (maxDepth < 3 || lPos.length > 50) return { fixed: false, changes: 0, depth: 0 };
- // Depth 3
- for (let i = 0; i < lPos.length; i++) {
- const r1 = applyB64Change(b64Buf, streamData, stream.start, lPos[i], ONE_BYTE);
- b64Buf[lPos[i]] = ONE_BYTE;
- for (let j = i + 1; j < lPos.length; j++) {
- const r2 = applyB64Change(b64Buf, streamData, stream.start, lPos[j], ONE_BYTE);
- b64Buf[lPos[j]] = ONE_BYTE;
- for (let k = j + 1; k < lPos.length; k++) {
- const r3 = applyB64Change(b64Buf, streamData, stream.start, lPos[k], ONE_BYTE);
- if (r3.changed && canInflate(streamData)) {
- b64Buf[lPos[k]] = ONE_BYTE;
- return { fixed: true, changes: 3, depth: 3 };
- }
- revertB64Change(streamData, stream.start, lPos[k], r3.oldBytes);
- }
- b64Buf[lPos[j]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[j], r2.oldBytes);
- }
- b64Buf[lPos[i]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[i], r1.oldBytes);
- }
- if (maxDepth < 4 || lPos.length > 45) return { fixed: false, changes: 0, depth: 0 };
- // Depth 4
- for (let i = 0; i < lPos.length; i++) {
- const r1 = applyB64Change(b64Buf, streamData, stream.start, lPos[i], ONE_BYTE);
- b64Buf[lPos[i]] = ONE_BYTE;
- for (let j = i + 1; j < lPos.length; j++) {
- const r2 = applyB64Change(b64Buf, streamData, stream.start, lPos[j], ONE_BYTE);
- b64Buf[lPos[j]] = ONE_BYTE;
- for (let k = j + 1; k < lPos.length; k++) {
- const r3 = applyB64Change(b64Buf, streamData, stream.start, lPos[k], ONE_BYTE);
- b64Buf[lPos[k]] = ONE_BYTE;
- for (let m = k + 1; m < lPos.length; m++) {
- const r4 = applyB64Change(b64Buf, streamData, stream.start, lPos[m], ONE_BYTE);
- if (r4.changed && canInflate(streamData)) {
- b64Buf[lPos[m]] = ONE_BYTE;
- return { fixed: true, changes: 4, depth: 4 };
- }
- revertB64Change(streamData, stream.start, lPos[m], r4.oldBytes);
- }
- b64Buf[lPos[k]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[k], r3.oldBytes);
- }
- b64Buf[lPos[j]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[j], r2.oldBytes);
- }
- b64Buf[lPos[i]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[i], r1.oldBytes);
- }
- return { fixed: false, changes: 0, depth: 0 };
- }
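- /**
-  * Sketch (not called anywhere): the four hand-unrolled depth loops above
-  * could be generalized into one recursive combination search. Uses the same
-  * applyB64Change/revertB64Change/canInflate helpers; exploration order
-  * differs from the unrolled version (depth-first instead of level-by-level).
-  */
- function bruteForceL1Recursive(
-   b64Buf: Buffer, streamData: Buffer, streamStart: number,
-   lPos: number[], startIdx: number, depth: number,
- ): boolean {
-   if (depth === 0) return false;
-   for (let i = startIdx; i < lPos.length; i++) {
-     const { oldBytes, changed } = applyB64Change(b64Buf, streamData, streamStart, lPos[i], ONE_BYTE);
-     b64Buf[lPos[i]] = ONE_BYTE;
-     if (changed && canInflate(streamData)) return true; // keep this change
-     if (bruteForceL1Recursive(b64Buf, streamData, streamStart, lPos, i + 1, depth - 1)) return true;
-     b64Buf[lPos[i]] = L_BYTE; // backtrack
-     revertB64Change(streamData, streamStart, lPos[i], oldBytes);
-   }
-   return false;
- }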
- // ── Iterative error fixing ───────────────────────────────────────────
- /**
- * Check if inflate can get past a known error position.
- * Returns true if the error at errPos is no longer present
- * (may still fail later at a different position).
- */
- function passesErrPos(streamData: Buffer, errPos: number): boolean {
- if (errPos < 3 || streamData.length < 3) return false;
- const testLen = Math.min(streamData.length - 2, errPos + 3);
- try {
- inflateRawSync(streamData.subarray(2, 2 + testLen), { finishFlush: constants.Z_SYNC_FLUSH });
- return true;
- } catch (e: any) {
- const msg = String(e.message || '');
- return msg.includes('unexpected end') || msg.includes('buffer error') || msg.includes('incomplete');
- }
- }
- /**
- * Iteratively fix stream errors by finding and fixing one corruption at a time.
- * For each error position, tries all base64 substitutions in a window before the
- * error. Keeps changes that push the error position significantly forward.
- * Reverts all changes if the stream doesn't ultimately decompress.
- */
- function iterativeErrorFix(
- b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
- maxIter: number = 15,
- windowB64: number = 300,
- ): { fixed: boolean; totalChanges: number; desc: string } {
- const changes: { pos: number; from: number; oldBytes: [number, number, number] }[] = [];
- for (let iter = 0; iter < maxIter; iter++) {
- if (canInflate(streamData)) {
- return { fixed: true, totalChanges: changes.length, desc: `${changes.length} fixes in ${iter} iter` };
- }
- const errPos = findCorruptionOffset(streamData);
- if (errPos >= streamData.length - 4) {
- // Error at end — try checksum repair
- if (tryFixChecksumOnData(streamData)) {
- return { fixed: true, totalChanges: changes.length, desc: `${changes.length} fixes + checksum` };
- }
- break;
- }
- const errPdfPos = stream.start + errPos;
- const b64Center = pdfToB64(errPdfPos);
- const searchLo = Math.max(pdfToB64(stream.start), b64Center - windowB64);
- const searchHi = Math.min(pdfToB64(stream.end) + 4, b64Center + 20);
- let bestPos = -1;
- let bestTo = 0;
- let bestNewErrPos = errPos;
- let bestOldBytes: [number, number, number] = [0, 0, 0];
- for (let i = searchLo; i < searchHi; i++) {
- const origChar = b64Buf[i];
- // Try confusion pairs first (more likely correct), then all other chars
- const confPairs = SUBS_MAP.get(origChar) || [];
- const allChars: number[] = [...confPairs];
- for (let c = 0; c < 64; c++) {
- const ch = B64_CHARS.charCodeAt(c);
- if (ch !== origChar && !confPairs.includes(ch)) allChars.push(ch);
- }
- for (const newChar of allChars) {
- const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, i, newChar);
- if (!changed) { revertB64Change(streamData, stream.start, i, oldBytes); continue; }
- if (canInflate(streamData)) {
- b64Buf[i] = newChar;
- return { fixed: true, totalChanges: changes.length + 1, desc: `${changes.length + 1} fixes` };
- }
- if (passesErrPos(streamData, errPos)) {
- const newErrPos = findCorruptionOffset(streamData);
- if (newErrPos > bestNewErrPos) {
- bestNewErrPos = newErrPos;
- bestPos = i;
- bestTo = newChar;
- bestOldBytes = [...oldBytes] as [number, number, number];
- }
- }
- revertB64Change(streamData, stream.start, i, oldBytes);
- }
- }
- if (bestPos === -1 || bestNewErrPos <= errPos) break;
- // Apply best change
- const origChar = b64Buf[bestPos];
- applyB64Change(b64Buf, streamData, stream.start, bestPos, bestTo);
- b64Buf[bestPos] = bestTo;
- changes.push({ pos: bestPos, from: origChar, oldBytes: bestOldBytes });
- }
- // Check one more time (might have fixed all data errors, just checksum left)
- if (canInflate(streamData)) {
- return { fixed: true, totalChanges: changes.length, desc: `${changes.length} fixes` };
- }
- // Not fixed — revert all changes in reverse order
- for (let i = changes.length - 1; i >= 0; i--) {
- const ch = changes[i];
- applyB64Change(b64Buf, streamData, stream.start, ch.pos, ch.from);
- b64Buf[ch.pos] = ch.from;
- }
- const partial = changes.length > 0 ? ` (${changes.length} partial, reverted)` : '';
- return { fixed: false, totalChanges: 0, desc: `no convergence${partial}` };
- }
- /**
- * Try to fix Adler-32 checksum on a standalone stream data buffer.
- * Modifies the buffer in-place if successful.
- */
- function tryFixChecksumOnData(streamData: Buffer): boolean {
- if (streamData.length < 6) return false;
- for (const trim of [2, 1, 0]) {
- const actualEnd = streamData.length - trim;
- if (actualEnd < 6) continue;
- let decompressed: Buffer;
- try { decompressed = inflateRawSync(streamData.subarray(2, actualEnd)); } catch { continue; }
- const adler = computeAdler32(decompressed);
- const pos = actualEnd - 4;
- const old = [streamData[pos], streamData[pos+1], streamData[pos+2], streamData[pos+3]];
- streamData[pos] = (adler >>> 24) & 0xFF;
- streamData[pos + 1] = (adler >>> 16) & 0xFF;
- streamData[pos + 2] = (adler >>> 8) & 0xFF;
- streamData[pos + 3] = adler & 0xFF;
- if (canInflate(streamData)) return true;
- streamData[pos] = old[0]; streamData[pos+1] = old[1]; streamData[pos+2] = old[2]; streamData[pos+3] = old[3];
- }
- return false;
- }
- // ── DFS search with backtracking ─────────────────────────────────────
- /**
- * Depth-first search for multi-error streams.
- * At each level, find the error position, try substitutions, and for the
- * top candidates that push past the error, recurse to fix the next error.
- * Backtracks if a path doesn't converge.
- */
- function dfsErrorSearch(
- b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
- maxDepth: number = 8,
- branchFactor: number = 2,
- windowB64: number = 200,
- ): { fixed: boolean; totalChanges: number; desc: string } {
- const applied: { pos: number; origChar: number }[] = [];
- let nodeCount = 0;
- const maxNodes = 500; // Limit total search nodes to prevent OOM
- function search(depth: number): boolean {
- if (canInflate(streamData)) return true;
- if (depth >= maxDepth || nodeCount >= maxNodes) {
- return tryFixChecksumOnData(streamData);
- }
- nodeCount++;
- const errPos = findCorruptionOffset(streamData);
- if (errPos >= streamData.length - 4) {
- return tryFixChecksumOnData(streamData);
- }
- const errPdfPos = stream.start + errPos;
- const b64Center = pdfToB64(errPdfPos);
- const lo = Math.max(pdfToB64(stream.start), b64Center - windowB64);
- const hi = Math.min(pdfToB64(stream.end) + 4, b64Center + 20);
- // Find candidates — confusion pairs first, fallback to all chars
- const candidates: { pos: number; to: number; improvement: number }[] = [];
- // Pass 1: confusion pairs only
- for (let i = lo; i < hi; i++) {
- const origChar = b64Buf[i];
- const confPairs = SUBS_MAP.get(origChar);
- if (!confPairs) continue;
- for (const newChar of confPairs) {
- const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, i, newChar);
- if (!changed) { revertB64Change(streamData, stream.start, i, oldBytes); continue; }
- if (canInflate(streamData)) {
- b64Buf[i] = newChar;
- applied.push({ pos: i, origChar });
- return true;
- }
- let improvement = 0;
- if (passesErrPos(streamData, errPos)) {
- const newErrPos = findCorruptionOffset(streamData);
- improvement = newErrPos - errPos;
- }
- revertB64Change(streamData, stream.start, i, oldBytes);
- if (improvement > 0) {
- candidates.push({ pos: i, to: newChar, improvement });
- }
- }
- }
- // Pass 2: if confusion pairs found nothing, try all chars in very tight window
- if (candidates.length === 0) {
- const tightLo = Math.max(pdfToB64(stream.start), b64Center - 40);
- const tightHi = Math.min(pdfToB64(stream.end) + 4, b64Center + 10);
- for (let i = tightLo; i < tightHi; i++) {
- const origChar = b64Buf[i];
- for (let c = 0; c < 64; c++) {
- const newChar = B64_CHARS.charCodeAt(c);
- if (newChar === origChar) continue;
- if (SUBS_MAP.get(origChar)?.includes(newChar)) continue; // already tried
- const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, i, newChar);
- if (!changed) { revertB64Change(streamData, stream.start, i, oldBytes); continue; }
- if (canInflate(streamData)) {
- b64Buf[i] = newChar;
- applied.push({ pos: i, origChar });
- return true;
- }
- let improvement = 0;
- if (passesErrPos(streamData, errPos)) {
- const newErrPos = findCorruptionOffset(streamData);
- improvement = newErrPos - errPos;
- }
- revertB64Change(streamData, stream.start, i, oldBytes);
- if (improvement > 0) {
- candidates.push({ pos: i, to: newChar, improvement });
- }
- }
- }
- }
- candidates.sort((a, b) => b.improvement - a.improvement);
- for (const cand of candidates.slice(0, branchFactor)) {
- if (nodeCount >= maxNodes) break;
- const origChar = b64Buf[cand.pos];
- applyB64Change(b64Buf, streamData, stream.start, cand.pos, cand.to);
- b64Buf[cand.pos] = cand.to;
- applied.push({ pos: cand.pos, origChar });
- if (search(depth + 1)) return true;
- applied.pop();
- applyB64Change(b64Buf, streamData, stream.start, cand.pos, origChar);
- b64Buf[cand.pos] = origChar;
- }
- return false;
- }
- const success = search(0);
- if (!success) {
- for (let i = applied.length - 1; i >= 0; i--) {
- const { pos, origChar } = applied[i];
- applyB64Change(b64Buf, streamData, stream.start, pos, origChar);
- b64Buf[pos] = origChar;
- }
- return { fixed: false, totalChanges: 0, desc: `DFS exhausted (${nodeCount} nodes)` };
- }
- return { fixed: true, totalChanges: applied.length, desc: `DFS ${applied.length} fixes` };
- }
- // ── Position-guided search (all confusion pairs, memory-efficient) ──
- function positionGuidedSearch(
- b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
- windowRadius: number = 60,
- ): { fixed: boolean; changes: number; desc: string } {
- const errPos = findCorruptionOffset(streamData);
- const errorPdfPos = stream.start + errPos;
- const b64Center = pdfToB64(errorPdfPos);
- interface Cand { pos: number; from: number; to: number }
- const candidates: Cand[] = [];
- const lo = Math.max(pdfToB64(stream.start), b64Center - windowRadius);
- const hi = Math.min(pdfToB64(stream.end) + 4, b64Center + windowRadius);
- for (let i = lo; i < hi; i++) {
- const ch = b64Buf[i];
- const subs = SUBS_MAP.get(ch);
- if (subs) {
- for (const to of subs) candidates.push({ pos: i, from: ch, to });
- }
- }
- if (candidates.length === 0) {
- return { fixed: false, changes: 0, desc: `no candidates near byte ${errPos}` };
- }
- // Depth 1
- for (const c of candidates) {
- const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, c.pos, c.to);
- if (changed && canInflate(streamData)) {
- b64Buf[c.pos] = c.to;
- return { fixed: true, changes: 1, desc: `1x ${String.fromCharCode(c.from)}->${String.fromCharCode(c.to)} near byte ${errPos}` };
- }
- revertB64Change(streamData, stream.start, c.pos, oldBytes);
- }
- // Depth 2
- const maxDouble = Math.min(candidates.length, 40);
- for (let i = 0; i < maxDouble; i++) {
- const r1 = applyB64Change(b64Buf, streamData, stream.start, candidates[i].pos, candidates[i].to);
- b64Buf[candidates[i].pos] = candidates[i].to;
- for (let j = i + 1; j < maxDouble; j++) {
- if (b64Buf[candidates[j].pos] !== candidates[j].from) continue;
- const r2 = applyB64Change(b64Buf, streamData, stream.start, candidates[j].pos, candidates[j].to);
- if (r2.changed && canInflate(streamData)) {
- b64Buf[candidates[j].pos] = candidates[j].to;
- return { fixed: true, changes: 2, desc: `2x near byte ${errPos}` };
- }
- revertB64Change(streamData, stream.start, candidates[j].pos, r2.oldBytes);
- }
- b64Buf[candidates[i].pos] = candidates[i].from;
- revertB64Change(streamData, stream.start, candidates[i].pos, r1.oldBytes);
- }
- // Depth 3
- const maxTriple = Math.min(candidates.length, 25);
- for (let i = 0; i < maxTriple; i++) {
- const r1 = applyB64Change(b64Buf, streamData, stream.start, candidates[i].pos, candidates[i].to);
- b64Buf[candidates[i].pos] = candidates[i].to;
- for (let j = i + 1; j < maxTriple; j++) {
- if (b64Buf[candidates[j].pos] !== candidates[j].from) continue;
- const r2 = applyB64Change(b64Buf, streamData, stream.start, candidates[j].pos, candidates[j].to);
- b64Buf[candidates[j].pos] = candidates[j].to;
- for (let k = j + 1; k < maxTriple; k++) {
- if (b64Buf[candidates[k].pos] !== candidates[k].from) continue;
- const r3 = applyB64Change(b64Buf, streamData, stream.start, candidates[k].pos, candidates[k].to);
- if (r3.changed && canInflate(streamData)) {
- b64Buf[candidates[k].pos] = candidates[k].to;
- return { fixed: true, changes: 3, desc: `3x near byte ${errPos}` };
- }
- revertB64Change(streamData, stream.start, candidates[k].pos, r3.oldBytes);
- }
- b64Buf[candidates[j].pos] = candidates[j].from;
- revertB64Change(streamData, stream.start, candidates[j].pos, r2.oldBytes);
- }
- b64Buf[candidates[i].pos] = candidates[i].from;
- revertB64Change(streamData, stream.start, candidates[i].pos, r1.oldBytes);
- }
- return { fixed: false, changes: 0, desc: `no fix (${candidates.length} candidates near byte ${errPos}/${stream.size})` };
- }
- // ── Main ────────────────────────────────────────────────────────────
- async function main() {
- const args = process.argv.slice(2);
- if (args.length < 2) {
- console.log("Usage: bun recover_pdf.ts <input.txt> <output.pdf>");
- process.exit(1);
- }
- const inputFile = args[0];
- const outputFile = args[1];
- console.log("=".repeat(64));
- console.log(" PDF Recovery Script");
- console.log("=".repeat(64));
- // ─── Step 1: Read and clean base64 ──────────────────────────────
- console.log(`\n[1/6] Reading ${inputFile}...`);
- const rawBase64 = readFileSync(inputFile, "utf-8").trim();
- console.log(` ${rawBase64.length} characters (with whitespace)`);
- let base64Clean = rawBase64.replace(/\s+/g, '');
- console.log(` ${base64Clean.length} base64 characters`);
- // ─── Step 2: Auto-discover and fix OCR errors ──────────────────
- console.log("\n[2/6] Auto-discovering and fixing OCR errors...");
- const autoResult = autoDiscoverAndFixOCRErrors(base64Clean);
- let base64Buf = autoResult.base64Buf;
- let pdf = autoResult.pdf;
- let pdfStr = autoResult.pdfStr;
- console.log(` ${autoResult.fixCount} auto-discovered fixes applied`);
- // ─── Step 3: Analyze streams ────────────────────────────────────
- console.log("\n[3/6] Analyzing streams...");
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- let allStreams = findAllStreams(pdfStr);
- let flatStreams = allStreams.filter(s => s.hasFilter);
- const nonFlatStreams = allStreams.filter(s => !s.hasFilter);
- let okCount = flatStreams.filter(s => canInflate(pdf.subarray(s.start, s.end))).length;
- console.log(` Found ${allStreams.length} streams total`);
- console.log(` ${flatStreams.length} FlateDecode (compressed), ${okCount} decompress OK`);
- console.log(` ${nonFlatStreams.length} uncompressed (skipped)`);
- const pageContentObjs = findPageContentObjects(pdfStr);
- // ─── Step 4: Fix Adler-32 checksums ─────────────────────────────
- console.log("\n[4/6] Repairing Adler-32 checksums...");
- let checksumFixed = 0;
- for (const stream of flatStreams) {
- if (canInflate(pdf.subarray(stream.start, stream.end))) continue;
- if (tryFixChecksum(pdf, stream)) {
- checksumFixed++;
- console.log(` Object ${stream.obj}: checksum fixed`);
- }
- }
- if (checksumFixed > 0) {
- base64Buf = Buffer.from(pdf.toString('base64'));
- pdfStr = pdf.toString('binary');
- console.log(` ${checksumFixed} streams fixed via checksum repair`);
- } else {
- console.log(" (no checksum-only errors found)");
- }
- // Refresh stream list
- allStreams = findAllStreams(pdfStr);
- flatStreams = allStreams.filter(s => s.hasFilter);
- okCount = flatStreams.filter(s => canInflate(pdf.subarray(s.start, s.end))).length;
- console.log(` Status: ${okCount}/${flatStreams.length} compressed streams OK`);
- // ─── Step 5: Stream repair ──────────────────────────────────────
- console.log("\n[5/6] Stream repair...");
- const failedStreams = flatStreams
- .filter(s => !canInflate(pdf.subarray(s.start, s.end)))
- .sort((a, b) => {
- const aPage = pageContentObjs.includes(a.obj);
- const bPage = pageContentObjs.includes(b.obj);
- if (aPage && !bPage) return -1;
- if (!aPage && bPage) return 1;
- return a.size - b.size;
- });
- console.log(` ${failedStreams.length} streams to repair\n`);
- let totalFixed = checksumFixed;
- for (const stream of failedStreams) {
- // Re-decode PDF to get current state (only once per stream, not per trial)
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- const currentStream = findObjStream(pdfStr, stream.obj) ||
- findAllStreams(pdfStr).find(s => s.obj === stream.obj);
- if (!currentStream) continue;
- // Extract stream data for in-place manipulation
- const streamData = Buffer.from(pdf.subarray(currentStream.start, currentStream.end));
- if (canInflate(streamData)) continue;
- const isPage = pageContentObjs.includes(stream.obj);
- const label = isPage ? '[PAGE]' : ' ';
- process.stdout.write(` Obj ${String(stream.obj).padStart(2)} ${label} (${currentStream.size}b): `);
- // Phase A: Position-guided search with confusion pairs (memory-efficient)
- const guided = positionGuidedSearch(base64Buf, streamData, currentStream);
- if (guided.fixed) {
- totalFixed++;
- console.log(`FIXED [guided] ${guided.desc}`);
- continue;
- }
- // Phase B: Brute-force l→1 (memory-efficient)
- const maxBfDepth = currentStream.size < 3000 ? 4 : 2;
- const bf = bruteForceL1(base64Buf, streamData, currentStream, maxBfDepth);
- if (bf.fixed) {
- totalFixed++;
- console.log(`FIXED [brute-force] ${bf.changes}x l->1 (depth ${bf.depth})`);
- continue;
- }
- // Phase C: Iterative error fixing (greedy, one error at a time)
- const maxIter = currentStream.size < 5000 ? 15 : 10;
- const window = currentStream.size < 5000 ? 300 : 200;
- const iterResult = iterativeErrorFix(base64Buf, streamData, currentStream, maxIter, window);
- if (iterResult.fixed) {
- totalFixed++;
- console.log(`FIXED [iterative] ${iterResult.desc}`);
- continue;
- }
- // Not fixed
- const err = getError(streamData);
- console.log(`not fixed (${err.substring(0, 45)}) ${iterResult.desc}`);
- }
- // ─── Step 5b: Post-repair checksum pass ─────────────────────────
- // Iterative fixes corrected data errors in b64Buf but checksum
- // patches were only applied to local streamData buffers. Re-decode
- // and repair checksums on all remaining failing streams.
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- console.log("\n Post-repair checksum pass...");
- let postChecksumFixed = 0;
- const refreshedStreams = findAllStreams(pdfStr).filter(s => s.hasFilter);
- for (const stream of refreshedStreams) {
- if (canInflate(pdf.subarray(stream.start, stream.end))) continue;
- if (tryFixChecksum(pdf, stream)) {
- postChecksumFixed++;
- totalFixed++;
- }
- }
- if (postChecksumFixed > 0) {
- console.log(` ${postChecksumFixed} additional streams fixed via checksum repair`);
- pdfStr = pdf.toString('binary');
- } else {
- console.log(" (no additional checksum fixes)");
- }
- // ─── Step 5c: Second pass with wider search for remaining ──────
- const stillFailing = findAllStreams(pdfStr)
- .filter(s => s.hasFilter && !canInflate(pdf.subarray(s.start, s.end)));
- if (stillFailing.length > 0) {
- // Re-sync base64Buf with the checksum-repaired pdf
- base64Buf = Buffer.from(pdf.toString('base64'));
- console.log(`\n Second pass (wider search) for ${stillFailing.length} remaining streams...\n`);
- for (const stream of stillFailing) {
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- const currentStream = findObjStream(pdfStr, stream.obj) ||
- findAllStreams(pdfStr).find(s => s.obj === stream.obj);
- if (!currentStream) continue;
- const streamData = Buffer.from(pdf.subarray(currentStream.start, currentStream.end));
- if (canInflate(streamData)) continue;
- process.stdout.write(` Obj ${String(stream.obj).padStart(2)} (${currentStream.size}b): `);
- if (currentStream.size > 15000) {
- console.log(`skipped (too large for DFS)`);
- continue;
- }
- // DFS with backtracking — explores alternative fixes when greedy gets stuck
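- // Keeps several candidate fixes per error position and backtracks when
- // a branch dead-ends; the trailing args are presumably maxIter=12,
- // branch=2, window=300, matching the knobs of the greedy phase.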
- const dfsResult = dfsErrorSearch(base64Buf, streamData, currentStream, 12, 2, 300);
- if (dfsResult.fixed) {
- totalFixed++;
- console.log(`FIXED [DFS] ${dfsResult.desc}`);
- } else {
- console.log(`not fixed ${dfsResult.desc}`);
- }
- }
- // Final checksum repair for any new fixes from second pass
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- for (const stream of findAllStreams(pdfStr).filter(s => s.hasFilter)) {
- if (canInflate(pdf.subarray(stream.start, stream.end))) continue;
- if (tryFixChecksum(pdf, stream)) { totalFixed++; postChecksumFixed++; }
- }
- pdfStr = pdf.toString('binary');
- }
- // ─── Step 5d: Rust-accelerated repair for remaining streams ─────
- const rustFailing = findAllStreams(pdfStr)
- .filter(s => s.hasFilter && !canInflate(pdf.subarray(s.start, s.end)));
- if (rustFailing.length > 0) {
- const rustBin = new URL("stream_fixer/target/release/stream_fixer", import.meta.url).pathname;
- const hasTool = existsSync(rustBin);
- if (hasTool) {
- console.log(`\n Rust-accelerated repair for ${rustFailing.length} remaining streams...\n`);
- // Write current b64 state to a temp file
- base64Buf = Buffer.from(pdf.toString('base64'));
- const tmpB64 = `/tmp/pdf_recover_b64_${process.pid}.txt`;
- writeFileSync(tmpB64, base64Buf);
- for (const stream of rustFailing) {
- const currentStream = findObjStream(pdfStr, stream.obj) ||
- findAllStreams(pdfStr).find(s => s.obj === stream.obj);
- if (!currentStream) continue;
- process.stdout.write(` Obj ${String(stream.obj).padStart(2)} (${currentStream.size}b): `);
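- // The helper takes the b64 temp file plus the stream's byte range and
- // prints JSON shaped like { fixed, changes: [{ b64_pos, new_char }],
- // desc } — shape inferred from the parsing below.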
- const result = spawnSync(rustBin, [
- tmpB64, String(currentStream.start), String(currentStream.end),
- "--max-iter", "30", "--max-nodes", "100000",
- "--window", "600", "--branch", "3", "--max-depth", "20"
- ], { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"], timeout: 300000 });
- if (result.status !== 0) {
- console.log(`error (${(result.stderr || "").substring(0, 60)})`);
- continue;
- }
- try {
- const output = JSON.parse(result.stdout);
- if (output.fixed && output.changes && output.changes.length > 0) {
- // Apply the b64 changes
- for (const ch of output.changes) {
- const pos = ch.b64_pos;
- base64Buf[pos] = ch.new_char.charCodeAt(0);
- }
- // Re-decode PDF and update
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- // Write updated b64 for next stream
- writeFileSync(tmpB64, base64Buf);
- // Checksum repair on this stream
- const fixed = findObjStream(pdfStr, stream.obj) ||
- findAllStreams(pdfStr).find(s => s.obj === stream.obj);
- if (fixed && fixed.hasFilter && !canInflate(pdf.subarray(fixed.start, fixed.end))) {
- if (tryFixChecksum(pdf, fixed)) {
- pdfStr = pdf.toString('binary');
- base64Buf = Buffer.from(pdf.toString('base64'));
- writeFileSync(tmpB64, base64Buf);
- }
- }
- const nowOk = !!(fixed && canInflate(pdf.subarray(fixed.start, fixed.end)));
- totalFixed += nowOk ? 1 : 0;
- console.log(`${nowOk ? 'FIXED' : 'partial'} [Rust] ${output.desc}`);
- } else if (output.fixed) {
- console.log(`already OK [Rust] ${output.desc}`);
- } else {
- console.log(`not fixed [Rust] ${output.desc}`);
- }
- } catch {
- console.log(`parse error: ${(result.stdout || "").substring(0, 60)}`);
- }
- }
- // Clean up temp file
- try { unlinkSync(tmpB64); } catch {}
- } else {
- console.log(`\n (Rust tool not found at ${rustBin} — skipping accelerated repair)`);
- console.log(` Build with: cd stream_fixer && cargo build --release`);
- }
- }
- // ─── Step 5e: Stream-end (BFINAL) repair for remaining streams ──
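- // DEFLATE data is a sequence of blocks, each headed by a BFINAL bit
- // (1 = last block) and a 2-bit BTYPE. If corruption clears BFINAL on
- // the last block, inflate runs off the end of the stream; setting the
- // bit (presumably what tryFixStreamEnd does) can terminate it cleanly.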
- const bfinalFailing = findAllStreams(pdfStr)
- .filter(s => s.hasFilter && !canInflate(pdf.subarray(s.start, s.end)));
- if (bfinalFailing.length > 0) {
- console.log(`\n Stream-end repair for ${bfinalFailing.length} remaining streams...\n`);
- for (const stream of bfinalFailing) {
- process.stdout.write(` Obj ${String(stream.obj).padStart(2)} (${stream.size}b): `);
- if (tryFixStreamEnd(pdf, stream)) {
- totalFixed++;
- pdfStr = pdf.toString('binary');
- console.log('FIXED [BFINAL]');
- } else {
- console.log('not fixed');
- }
- }
- }
- // ─── Step 5f: Visual repair (ObjStm alignment, color, content) ──
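- // Best-effort cosmetic passes (behaviour inferred from the function
- // names): re-align object-stream (/ObjStm) offset tables, truncate
- // content streams whose decoded operators are garbled, and substitute
- // a plain CMYK space for corrupted colour-space objects.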
- console.log('\n Visual repair...');
- pdfStr = fixObjStmAlignment(pdf, pdfStr);
- pdfStr = truncateGarbledContentStreams(pdf, pdfStr);
- pdfStr = replaceColorSpacesWithCMYK(pdf, pdfStr);
- console.log(' done.');
- // ─── Step 6: Final output ───────────────────────────────────────
- console.log("\n" + "=".repeat(64));
- console.log(" RESULTS");
- console.log("=".repeat(64));
- // Per-object detection for accurate results
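- // Earlier repairs can shift byte offsets, so each original object is
- // re-located in the current pdfStr instead of reusing stale ranges.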
- let finalOk = 0;
- let finalTotal = 0;
- const stillFailingList: { obj: number; size: number; err: string }[] = [];
- for (const origStream of flatStreams) {
- const s = findObjStream(pdfStr, origStream.obj) ||
- findAllStreams(pdfStr).find(x => x.obj === origStream.obj);
- if (!s || !s.hasFilter) continue;
- finalTotal++;
- const data = pdf.subarray(s.start, s.end);
- if (canInflate(data)) {
- finalOk++;
- } else {
- stillFailingList.push({ obj: s.obj, size: s.size, err: getError(data) });
- }
- }
- console.log(`\nCompressed streams: ${finalOk}/${finalTotal} OK`);
- console.log(`Total streams fixed: ${totalFixed}`);
- if (pageContentObjs.length > 0) {
- let pageOk = 0;
- console.log("\nPage content streams:");
- for (const objNum of pageContentObjs) {
- const s = findObjStream(pdfStr, objNum);
- if (s && s.hasFilter) {
- const ok = canInflate(pdf.subarray(s.start, s.end));
- if (ok) pageOk++;
- console.log(` Object ${objNum}: ${ok ? 'OK' : 'FAIL'}`);
- }
- }
- console.log(`Page content: ${pageOk}/${pageContentObjs.length} OK`);
- }
- if (stillFailingList.length > 0) {
- console.log(`\nStill failing (${stillFailingList.length}):`);
- for (const s of stillFailingList) {
- console.log(` Obj ${s.obj}: ${s.size}b - ${s.err.substring(0, 50)}`);
- }
- }
- // Save raw repaired PDF
- const rawFile = outputFile.replace(/\.pdf$/, "-raw.pdf");
- writeFileSync(rawFile, pdf);
- console.log(`\nSaved raw: ${rawFile}`);
- // Extract text and re-distill
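- // Ghostscript's pdfwrite rebuilds the xref and drops objects it cannot
- // parse, which can salvage more text than the raw repaired file;
- // whichever version yields more non-empty text lines wins below.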
- console.log("\n[6/6] Extracting text and re-distilling...");
- const rawText = extractText(rawFile);
- const rawLines = rawText.split('\n').filter((l: string) => l.trim());
- console.log(` Raw PDF text: ${rawLines.length} non-empty lines`);
- if (runGhostscript(rawFile, outputFile)) {
- console.log(` Saved: ${outputFile}`);
- const distilledText = extractText(outputFile);
- const distilledLines = distilledText.split('\n').filter((l: string) => l.trim());
- console.log(` Re-distilled text: ${distilledLines.length} non-empty lines`);
- const useDistilled = distilledLines.length >= rawLines.length;
- const bestText = useDistilled ? distilledText : rawText;
- const bestLines = bestText.split('\n').filter((l: string) => l.trim());
- const bestSource = useDistilled ? 'distilled' : 'raw';
- console.log(`\n--- Extracted Text (${bestSource}, first 40 lines) ---`);
- if (bestLines.length > 0) {
- console.log(bestLines.slice(0, 40).join('\n'));
- console.log(`\n(${bestLines.length} total non-empty lines)`);
- } else {
- console.log("(no text extracted)");
- }
- } else {
- console.log(` Ghostscript re-distill failed, using raw PDF`);
- writeFileSync(outputFile, pdf);
- }
- // Keep raw file for debugging
- // if (existsSync(rawFile) && existsSync(outputFile) && rawFile !== outputFile) {
- // unlinkSync(rawFile);
- // }
- }
- main().catch(console.error);