Guest User

Untitled

a guest
Feb 6th, 2026
160
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env bun
  2.  
  3. /**
  4.  * PDF Recovery Script
  5.  *
  6.  * Recovers a corrupted base64-encoded PDF that was OCR'd with errors.
  7.  * Techniques: text substitutions, Adler-32 checksum repair,
  8.  *   error-position-guided fuzzing with OCR confusion pairs,
  9.  *   brute-force l→1 substitution.
  10.  *
  11.  * Memory-efficient: modifies stream data bytes in-place instead of
  12.  * re-decoding the entire PDF for each trial.
  13.  *
  14.  * Usage: bun recover_pdf.ts <input.txt> <output.pdf>
  15.  */
  16.  
  17. import { readFileSync, writeFileSync, existsSync, unlinkSync } from "fs";
  18. import { inflateSync, inflateRawSync, deflateSync, constants } from "zlib";
  19. import { spawnSync } from "child_process";
  20.  
  21. // ── Utilities ───────────────────────────────────────────────────────
  22.  
  23. function commandExists(cmd: string): boolean {
  24.   return spawnSync("which", [cmd], { encoding: "utf-8" }).status === 0;
  25. }
  26.  
  27. function runGhostscript(inputPath: string, outputPath: string): boolean {
  28.   if (!commandExists("gs")) {
  29.     console.error("\nGhostscript (gs) not found. Install with: brew install ghostscript");
  30.     return false;
  31.   }
  32.   if (existsSync(outputPath)) unlinkSync(outputPath);
  33.   spawnSync("gs", [
  34.     "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4",
  35.     "-dNOPAUSE", "-dQUIET", "-dBATCH",
  36.     `-sOutputFile=${outputPath}`, inputPath
  37.   ], { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"] });
  38.   return existsSync(outputPath);
  39. }
  40.  
  41. function extractText(pdfPath: string): string {
  42.   if (!commandExists("gs")) return "(gs not available)";
  43.   const result = spawnSync("gs", [
  44.     "-sDEVICE=txtwrite", "-sOutputFile=-",
  45.     "-dQUIET", "-dNOPAUSE", "-dBATCH", pdfPath
  46.   ], { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"] });
  47.   return result.stdout || "";
  48. }
  49.  
  50. // ── PDF Token Dictionary ─────────────────────────────────────────────
  51. // Known valid tokens in PDF files for auto-discovering OCR corruption.
  52.  
  53. const PDF_TOKENS: Set<string> = new Set([
  54.   // Dictionary keys
  55.   "/Type", "/Subtype", "/Filter", "/Length", "/Width", "/Height",
  56.   "/Name", "/Names", "/Pages", "/Page", "/Root", "/Info", "/Size",
  57.   "/Resources", "/Contents", "/MediaBox", "/CropBox", "/BleedBox",
  58.   "/TrimBox", "/ArtBox", "/Font", "/XObject", "/ExtGState",
  59.   "/ColorSpace", "/ProcSet", "/Encoding", "/BaseFont", "/Metadata",
  60.   "/Predictor", "/Columns", "/Colors", "/DecodeParms", "/Decode",
  61.   "/Linearized", "/Parent", "/Kids", "/Count", "/Rotate",
  62.   "/Annots", "/Border", "/Rect", "/Dest", "/Action", "/URI", "/S",
  63.   "/FontDescriptor", "/FontName", "/FontFile", "/FontFile2",
  64.   "/FontFile3", "/Flags", "/ItalicAngle", "/StemV", "/StemH",
  65.   "/Ascent", "/Descent", "/CapHeight", "/XHeight", "/MissingWidth",
  66.   "/FontBBox", "/FirstChar", "/LastChar", "/Widths", "/AvgWidth",
  67.   "/MaxWidth", "/Leading", "/ToUnicode", "/CIDSystemInfo",
  68.   "/DescendantFonts", "/BaseEncoding", "/Differences", "/DW", "/W",
  69.   "/BitsPerComponent", "/ImageMask", "/Mask", "/SMask",
  70.   "/Interpolate", "/Intent", "/ColorTransform",
  71.   "/ID", "/Index", "/Prev", "/N", "/O", "/P", "/E", "/T", "/H", "/L",
  72.   "/Producer", "/Creator", "/CreationDate", "/ModDate", "/LastModified",
  73.   "/Title", "/Author", "/Subject", "/Keywords", "/Trapped",
  74.   "/GTS_PDFXVersion", "/OutputIntents", "/DestOutputProfile",
  75.   "/MarkInfo", "/StructTreeRoot", "/Lang",
  76.   "/ViewerPreferences", "/PageLayout", "/PageMode",
  77.   "/Outlines", "/Threads", "/OpenAction", "/AcroForm", "/Fields",
  78.   "/Prop_Build", "/Properties", "/Group", "/K", "/Pg",
  79.   "/BBox", "/Matrix", "/FormType", "/OC", "/OCGs", "/OCProperties",
  80.   "/Registry", "/Ordering", "/Supplement",
  81.   // Type values
  82.   "/Catalog", "/FontDescriptor", "/Image", "/Form",
  83.   "/Annot", "/Link", "/Text", "/Widget", "/ObjStm", "/XRef",
  84.   // Filter values
  85.   "/FlateDecode", "/DCTDecode", "/ASCII85Decode", "/ASCIIHexDecode",
  86.   "/LZWDecode", "/RunLengthDecode", "/CCITTFaxDecode",
  87.   "/JBIG2Decode", "/JPXDecode", "/Crypt",
  88.   // Color spaces
  89.   "/DeviceRGB", "/DeviceCMYK", "/DeviceGray", "/DeviceN",
  90.   "/ICCBased", "/Indexed", "/CalRGB", "/CalGray", "/Lab",
  91.   "/Pattern", "/Separation",
  92.   // ProcSet values
  93.   "/PDF", "/ImageB", "/ImageC", "/ImageI",
  94.   // XMP namespaces (bare, no leading /)
  95.   "xmpG", "xmpMM", "xmpTPg", "stRef", "stFnt", "stEvt",
  96.   "pdfx", "pdfaid",
  97.   // Common parameter values (bare)
  98.   "FlateDecode", "DCTDecode", "CMYK", "Name", "ProcSet",
  99.   "Predictor", "mode", "LastModified",
  100. ]);
  101.  
  102. // ── OCR Confusion Model ─────────────────────────────────────────────
  103. // Maps characters to what OCR commonly misreads them as (bidirectional).
  104.  
  105. const OCR_CONFUSIONS: Map<string, string[]> = new Map([
  106.   [".", ["/"]],
  107.   ["/", ["."]],
  108.   ["l", ["1", "I"]],
  109.   ["1", ["l", "I"]],
  110.   ["I", ["l", "1", "M"]],
  111.   ["O", ["0"]],
  112.   ["0", ["O"]],
  113.   ["m", ["i"]],
  114.   ["i", ["m"]],
  115.   ["M", ["I"]],
  116.   ["c", ["b"]],
  117.   ["b", ["c"]],
  118.   ["5", ["S", "%"]],
  119.   ["S", ["5"]],
  120.   ["%", ["5"]],
  121.   ["8", ["B"]],
  122.   ["B", ["8"]],
  123.   ["e", ["u"]],
  124.   ["u", ["e"]],
  125. ]);
  126.  
// ── Auto-discovery of OCR errors ────────────────────────────────────

/** One located OCR-corruption repair in the decoded PDF text. */
interface DiscoveredFix {
  offset: number;       // absolute offset into the binary PDF string
  original: string;     // corrupted text currently at `offset`
  replacement: string;  // corrected text to write in its place
  rule: string;         // human-readable description of the heuristic used
}
  135.  
  136. /** Find plaintext regions (everything outside stream...endstream) */
  137. function findPlaintextRegions(pdfStr: string): { start: number; end: number }[] {
  138.   const regions: { start: number; end: number }[] = [];
  139.   const streamRe = /stream[\r\n][\s\S]*?endstream/g;
  140.   let lastEnd = 0;
  141.   let m;
  142.   while ((m = streamRe.exec(pdfStr)) !== null) {
  143.     if (m.index > lastEnd) regions.push({ start: lastEnd, end: m.index });
  144.     lastEnd = m.index + m[0].length;
  145.   }
  146.   if (lastEnd < pdfStr.length) regions.push({ start: lastEnd, end: pdfStr.length });
  147.   return regions;
  148. }
  149.  
  150. /** Try single-char OCR confusions to match a token against the dictionary */
  151. function tryOCRConfusions(token: string): string | null {
  152.   // 1-char substitutions
  153.   for (let i = 0; i < token.length; i++) {
  154.     const confusions = OCR_CONFUSIONS.get(token[i]);
  155.     if (!confusions) continue;
  156.     for (const sub of confusions) {
  157.       if (sub.length !== 1) continue;
  158.       const candidate = token.substring(0, i) + sub + token.substring(i + 1);
  159.       if (PDF_TOKENS.has(candidate)) return candidate;
  160.     }
  161.   }
  162.   // 2-char substitutions for longer tokens
  163.   if (token.length > 4) {
  164.     for (let i = 0; i < token.length; i++) {
  165.       const c1 = OCR_CONFUSIONS.get(token[i]);
  166.       if (!c1) continue;
  167.       for (const s1 of c1) {
  168.         if (s1.length !== 1) continue;
  169.         const partial = token.substring(0, i) + s1 + token.substring(i + 1);
  170.         for (let j = i + 1; j < partial.length; j++) {
  171.           const c2 = OCR_CONFUSIONS.get(partial[j]);
  172.           if (!c2) continue;
  173.           for (const s2 of c2) {
  174.             if (s2.length !== 1) continue;
  175.             const candidate = partial.substring(0, j) + s2 + partial.substring(j + 1);
  176.             if (PDF_TOKENS.has(candidate)) return candidate;
  177.           }
  178.         }
  179.       }
  180.     }
  181.   }
  182.   return null;
  183. }
  184.  
/**
 * Auto-discover and fix OCR errors in the decoded PDF using PDF spec knowledge.
 * Replaces hardcoded BASE64_SUBSTITUTIONS and DECODED_FIXES.
 *
 * Decodes the base64, scans only plaintext regions (outside
 * stream...endstream), and applies four heuristics that match corrupted
 * text against PDF_TOKENS. All replacements are equal-length by
 * construction (single-char swaps), so offsets of later fixes remain
 * valid; they are nonetheless applied in descending-offset order.
 */
function autoDiscoverAndFixOCRErrors(base64Clean: string): {
  base64Buf: Buffer; pdf: Buffer; pdfStr: string; fixCount: number;
} {
  let pdf = Buffer.from(base64Clean, 'base64');
  // 'binary' (latin1) keeps a 1:1 char↔byte mapping, so string offsets
  // double as byte offsets into the Buffer.
  let pdfStr = pdf.toString('binary');
  const regions = findPlaintextRegions(pdfStr);
  const fixes: DiscoveredFix[] = [];
  const seen = new Set<number>();  // offsets already claimed by an earlier rule

  for (const region of regions) {
    const text = pdfStr.substring(region.start, region.end);

    // Pattern 1: PDF names — /Word or .Word (dot may be corrupted slash)
    const nameRe = /[./][A-Z][A-Za-z0-9_]*/g;
    let m;
    while ((m = nameRe.exec(text)) !== null) {
      const token = m[0];
      const absOff = region.start + m.index;
      if (seen.has(absOff)) continue;

      if (token.startsWith(".")) {
        // Try . → / directly
        const slashed = "/" + token.substring(1);
        if (PDF_TOKENS.has(slashed)) {
          fixes.push({ offset: absOff, original: token, replacement: slashed,
            rule: `. -> / (${slashed})` });
          seen.add(absOff);
          continue;
        }
        // Try . → / plus further OCR fix
        const further = tryOCRConfusions(slashed);
        if (further) {
          fixes.push({ offset: absOff, original: token, replacement: further,
            rule: `. -> / + OCR fix (${further})` });
          seen.add(absOff);
          continue;
        }
      }

      if (token.startsWith("/") && !PDF_TOKENS.has(token)) {
        const fixed = tryOCRConfusions(token);
        if (fixed) {
          fixes.push({ offset: absOff, original: token, replacement: fixed,
            rule: `OCR fix (${fixed})` });
          seen.add(absOff);
        }
      }
    }

    // Pattern 2: Bare words (XMP namespaces, parameter values)
    const wordRe = /(?<![/.\w])[a-zA-Z][a-zA-Z]{2,20}(?![a-zA-Z])/g;
    while ((m = wordRe.exec(text)) !== null) {
      const token = m[0];
      const absOff = region.start + m.index;
      if (seen.has(absOff) || PDF_TOKENS.has(token)) continue;
      const fixed = tryOCRConfusions(token);
      if (fixed) {
        fixes.push({ offset: absOff, original: token, replacement: fixed,
          rule: `bare word OCR fix (${fixed})` });
        seen.add(absOff);
      }
    }

    // Pattern 3: Structural . → / after >> or ] or digits
    const structRe = /(>>|]|\d)\.([\s\r\n/A-Z])/g;
    while ((m = structRe.exec(text)) !== null) {
      // Offset of the "." itself, just past the structural prefix m[1].
      const dotOff = region.start + m.index + m[1].length;
      if (seen.has(dotOff)) continue;
      fixes.push({ offset: dotOff, original: ".", replacement: "/",
        rule: `. -> / after "${m[1]}"` });
      seen.add(dotOff);
    }

    // Pattern 4: % in hex context (likely misread 5)
    const hexRe = /%([0-9A-Fa-f]{4,})/g;
    while ((m = hexRe.exec(text)) !== null) {
      const absOff = region.start + m.index;
      if (seen.has(absOff)) continue;
      fixes.push({ offset: absOff, original: "%", replacement: "5",
        rule: `% -> 5 in hex context` });
      seen.add(absOff);
    }
  }

  // Apply fixes (reverse offset order to preserve positions)
  fixes.sort((a, b) => b.offset - a.offset);
  for (const fix of fixes) {
    pdfStr = pdfStr.substring(0, fix.offset) + fix.replacement +
             pdfStr.substring(fix.offset + fix.original.length);
  }

  // Log discoveries (sorted by offset for readability)
  const sorted = [...fixes].sort((a, b) => a.offset - b.offset);
  for (const fix of sorted) {
    console.log(`    "${fix.original}" -> "${fix.replacement}" [${fix.rule}]`);
  }

  if (fixes.length > 0) {
    pdf = Buffer.from(pdfStr, 'binary');
  }
  // Re-encode so downstream fuzzing operates on the corrected base64.
  const base64Buf = Buffer.from(pdf.toString('base64'));
  return { base64Buf, pdf, pdfStr, fixCount: fixes.length };
}
  292.  
// ── Base64 helpers ──────────────────────────────────────────────────

// Standard base64 alphabet, indexed by 6-bit value.
const B64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
// Reverse lookup: ASCII code → 6-bit value (codes not in the alphabet stay 0).
const B64_VAL = new Uint8Array(128);
for (let i = 0; i < 64; i++) B64_VAL[B64_CHARS.charCodeAt(i)] = i;

// ASCII codes used by the brute-force l→1 substitution pass.
const L_BYTE = 'l'.charCodeAt(0);
const ONE_BYTE = '1'.charCodeAt(0);
  301.  
  302. function pdfToB64(pdfPos: number): number {
  303.   return Math.floor(pdfPos / 3) * 4;
  304. }
  305.  
  306. /** Decode one base64 group (4 chars → 3 bytes) */
  307. function decodeGroup(b64Buf: Buffer, groupStart: number): [number, number, number] {
  308.   const a = B64_VAL[b64Buf[groupStart]];
  309.   const b = B64_VAL[b64Buf[groupStart + 1]];
  310.   const c = B64_VAL[b64Buf[groupStart + 2]];
  311.   const d = B64_VAL[b64Buf[groupStart + 3]];
  312.   return [(a << 2) | (b >> 4), ((b & 0xF) << 4) | (c >> 2), ((c & 3) << 6) | d];
  313. }
  314.  
/**
 * Apply a base64 char change to a stream data buffer IN PLACE.
 * Returns the old bytes so the change can be reverted.
 * Only modifies bytes that fall within the stream data range.
 *
 * One 4-char base64 group decodes to 3 PDF bytes, so a single char
 * change can touch up to 3 decoded bytes. `streamStart` is the absolute
 * PDF offset corresponding to streamData[0]. b64Buf itself is restored
 * before returning — only streamData is mutated.
 */
function applyB64Change(
  b64Buf: Buffer, streamData: Buffer, streamStart: number,
  b64Pos: number, newChar: number
): { oldBytes: [number, number, number]; changed: boolean } {
  const orig = b64Buf[b64Pos];
  const groupStart = Math.floor(b64Pos / 4) * 4;  // first char of this 4-char group
  const pdfByteStart = (groupStart / 4) * 3;      // absolute PDF offset of the group's first byte

  // Get old decoded bytes for this group
  const old: [number, number, number] = [...decodeGroup(b64Buf, groupStart)] as [number, number, number];

  // Temporarily apply change and decode
  b64Buf[b64Pos] = newChar;
  const [n0, n1, n2] = decodeGroup(b64Buf, groupStart);
  b64Buf[b64Pos] = orig;

  let changed = false;
  // Offsets of the group's 3 decoded bytes relative to streamData[0].
  const offsets = [pdfByteStart - streamStart, pdfByteStart - streamStart + 1, pdfByteStart - streamStart + 2];
  const newBytes = [n0, n1, n2];

  for (let k = 0; k < 3; k++) {
    // Skip bytes outside the stream window or already equal to the new value.
    if (offsets[k] >= 0 && offsets[k] < streamData.length && newBytes[k] !== streamData[offsets[k]]) {
      streamData[offsets[k]] = newBytes[k];
      changed = true;
    }
  }

  // Return the old values for the 3 positions (for reverting)
  return { oldBytes: old, changed };
}
  350.  
  351. /** Revert a base64 change on stream data */
  352. function revertB64Change(
  353.   streamData: Buffer, streamStart: number,
  354.   b64Pos: number, oldBytes: [number, number, number]
  355. ): void {
  356.   const groupStart = Math.floor(b64Pos / 4) * 4;
  357.   const pdfByteStart = (groupStart / 4) * 3;
  358.   for (let k = 0; k < 3; k++) {
  359.     const off = pdfByteStart - streamStart + k;
  360.     if (off >= 0 && off < streamData.length) {
  361.       streamData[off] = oldBytes[k];
  362.     }
  363.   }
  364. }
  365.  
  366. // ── OCR confusion pairs ─────────────────────────────────────────────
  367.  
  368. const CONFUSION_PAIRS: [number, number][] = [
  369.   [0x6C, 0x31], [0x31, 0x6C], // l ↔ 1
  370.   [0x4F, 0x30], [0x30, 0x4F], // O ↔ 0
  371.   [0x49, 0x6C], [0x6C, 0x49], // I ↔ l
  372.   [0x49, 0x31], [0x31, 0x49], // I ↔ 1
  373.   [0x35, 0x53], [0x53, 0x35], // 5 ↔ S
  374.   [0x38, 0x42], [0x42, 0x38], // 8 ↔ B
  375. ];
  376.  
  377. const SUBS_MAP = new Map<number, number[]>();
  378. for (const [from, to] of CONFUSION_PAIRS) {
  379.   if (!SUBS_MAP.has(from)) SUBS_MAP.set(from, []);
  380.   const arr = SUBS_MAP.get(from)!;
  381.   if (!arr.includes(to)) arr.push(to);
  382. }
  383.  
  384. // ── Types ───────────────────────────────────────────────────────────
  385.  
  386. interface StreamInfo {
  387.   obj: number;
  388.   start: number;
  389.   end: number;
  390.   size: number;
  391.   hasFilter: boolean;
  392. }
  393.  
  394. // ── Stream detection ────────────────────────────────────────────────
  395.  
  396. function findAllStreams(pdfStr: string): StreamInfo[] {
  397.   const streams: StreamInfo[] = [];
  398.   const re = />>\s*stream[\r\n]+/g;
  399.   let m;
  400.   while ((m = re.exec(pdfStr)) !== null) {
  401.     const dataStart = m.index + m[0].length;
  402.     const lookback = pdfStr.substring(Math.max(0, m.index - 2000), m.index);
  403.     const lastObj = lookback.lastIndexOf(' 0 obj');
  404.     if (lastObj === -1) continue;
  405.     const lastEndobj = lookback.lastIndexOf('endobj');
  406.     if (lastEndobj > lastObj) continue;
  407.     const beforeObj = lookback.substring(Math.max(0, lastObj - 10), lastObj);
  408.     const numMatch = beforeObj.match(/(\d+)\s*$/);
  409.     if (!numMatch) continue;
  410.     const objNum = parseInt(numMatch[1]);
  411.     const dictText = lookback.substring(lastObj);
  412.     const end = pdfStr.indexOf('endstream', dataStart);
  413.     if (end === -1) continue;
  414.     const hasFilter = /FlateDecode/.test(dictText);
  415.     streams.push({ obj: objNum, start: dataStart, end, size: end - dataStart, hasFilter });
  416.   }
  417.   return streams;
  418. }
  419.  
  420. function findObjStream(pdfStr: string, objNum: number): StreamInfo | null {
  421.   const pat = new RegExp('(?:^|[\\r\\n])' + objNum + '\\s+0\\s+obj', 'g');
  422.   let om;
  423.   while ((om = pat.exec(pdfStr)) !== null) {
  424.     const objStart = om.index;
  425.     const after = pdfStr.substring(objStart, objStart + 50000);
  426.     const sm = after.match(/>>\s*stream[\r\n]+/);
  427.     if (!sm) continue;
  428.     const between = after.substring(0, sm.index! + sm[0].length);
  429.     if (/endobj/.test(between)) continue;
  430.     const start = objStart + sm.index! + sm[0].length;
  431.     const end = pdfStr.indexOf('endstream', start);
  432.     if (end === -1) continue;
  433.     const hasFilter = /FlateDecode/.test(between);
  434.     return { obj: objNum, start, end, size: end - start, hasFilter };
  435.   }
  436.   return null;
  437. }
  438.  
  439. function findPageContentObjects(pdfStr: string): number[] {
  440.   const contents: number[] = [];
  441.   const re = /\/Type\s*\/Page\b(?!s)([\s\S]*?)(?=endobj)/g;
  442.   let pm;
  443.   while ((pm = re.exec(pdfStr)) !== null) {
  444.     const chunk = pm[0] + pm[1];
  445.     const single = chunk.match(/\/Contents\s+(\d+)\s+0\s+R/);
  446.     if (single) contents.push(parseInt(single[1]));
  447.     const arr = chunk.match(/\/Contents\s*\[([\d\s\nR]+)\]/);
  448.     if (arr) {
  449.       for (const ref of arr[1].matchAll(/(\d+)\s+0\s+R/g)) {
  450.         contents.push(parseInt(ref[1]));
  451.       }
  452.     }
  453.   }
  454.   return [...new Set(contents)];
  455. }
  456.  
  457. // ── Decompression helpers ───────────────────────────────────────────
  458.  
  459. function canInflate(data: Buffer): boolean {
  460.   try { inflateSync(data); return true; } catch { return false; }
  461. }
  462.  
  463. function getError(data: Buffer): string {
  464.   try { inflateSync(data); return "OK"; }
  465.   catch (e: any) { return String(e.message || e).replace(/^Error.*?:\s*/, ''); }
  466. }
  467.  
/**
 * Binary-search for the byte offset where a FlateDecode stream stops
 * inflating cleanly. Skips the 2-byte zlib header, then inflates raw
 * prefixes of varying length: errors that merely mean "truncated" let
 * the search advance, while real corruption errors shrink the window.
 * Returns an offset relative to the start of the stream data (header
 * included), suitable for seeding fuzzing near the damage.
 */
function findCorruptionOffset(streamData: Buffer): number {
  if (streamData.length < 3) return 0;
  const rawData = streamData.subarray(2);  // drop the 2-byte zlib header
  let lo = 0, hi = rawData.length;
  while (lo < hi - 1) {
    const mid = Math.floor((lo + hi) / 2);
    try {
      // Z_SYNC_FLUSH tolerates an incomplete tail, so a clean prefix succeeds.
      inflateRawSync(rawData.subarray(0, mid), { finishFlush: constants.Z_SYNC_FLUSH });
      lo = mid;
    } catch (e: any) {
      const msg = String(e.message || '');
      // These messages indicate truncation, not corruption — keep extending.
      if (msg.includes('unexpected end') || msg.includes('buffer error') ||
          msg.includes('incomplete') || msg.includes('need dictionary')) {
        lo = mid;
      } else { hi = mid; }
    }
  }
  return lo + 2;  // convert back to a stream-data offset (re-add the header)
}
  487.  
  488. // ── Adler-32 checksum repair ────────────────────────────────────────
  489.  
  490. function computeAdler32(data: Buffer): number {
  491.   let a = 1, b = 0;
  492.   const MOD = 65521;
  493.   for (let i = 0; i < data.length; i++) {
  494.     a = (a + data[i]) % MOD;
  495.     b = (b + a) % MOD;
  496.   }
  497.   return ((b << 16) | a) >>> 0;
  498. }
  499.  
/**
 * Repair a FlateDecode stream whose deflate body inflates but whose
 * trailing Adler-32 checksum is wrong: recompute the checksum from the
 * decompressed bytes and patch it into `pdf` in place. Returns true on
 * success (verified with a full inflateSync); every patch is reverted
 * when verification fails.
 */
function tryFixChecksum(pdf: Buffer, stream: StreamInfo): boolean {
  // Need at least 2 header + 4 checksum bytes of zlib framing.
  if (!stream.hasFilter || stream.size < 6) return false;

  // Strategy 1: Patch near the end (works when zlib data fills the stream)
  // Trim 2/1/0 trailing bytes to tolerate a stray EOL before "endstream".
  for (const trim of [2, 1, 0]) {
    const actualEnd = stream.end - trim;
    const streamData = pdf.subarray(stream.start, actualEnd);
    if (streamData.length < 6) continue;
    let decompressed: Buffer;
    try { decompressed = inflateRawSync(streamData.subarray(2)); } catch { continue; }
    const adler = computeAdler32(decompressed);
    const pos = actualEnd - 4;  // checksum = last 4 bytes of the zlib stream
    const old = [pdf[pos], pdf[pos+1], pdf[pos+2], pdf[pos+3]];
    // Adler-32 is stored big-endian (RFC 1950).
    pdf[pos]     = (adler >>> 24) & 0xFF;
    pdf[pos + 1] = (adler >>> 16) & 0xFF;
    pdf[pos + 2] = (adler >>>  8) & 0xFF;
    pdf[pos + 3] =  adler         & 0xFF;
    if (canInflate(pdf.subarray(stream.start, stream.end))) return true;
    // Revert
    pdf[pos] = old[0]; pdf[pos+1] = old[1]; pdf[pos+2] = old[2]; pdf[pos+3] = old[3];
  }

  // Strategy 2: Find actual deflate data end (handles trailing data after checksum)
  const raw = pdf.subarray(stream.start + 2, stream.end);
  let decompressed: Buffer;
  try { decompressed = inflateRawSync(raw); } catch { return false; }
  // Binary search for minimum raw bytes needed (= deflate data length)
  let lo = 1, hi = raw.length;
  while (lo + 1 < hi) {
    const mid = Math.floor((lo + hi) / 2);
    try { inflateRawSync(raw.subarray(0, mid)); hi = mid; } catch { lo = mid; }
  }
  const deflateLen = hi;
  const adler = computeAdler32(decompressed);
  // The checksum immediately follows the deflate data.
  const checksumPos = stream.start + 2 + deflateLen;
  if (checksumPos + 4 > stream.end) return false;
  const old = [pdf[checksumPos], pdf[checksumPos+1], pdf[checksumPos+2], pdf[checksumPos+3]];
  pdf[checksumPos]     = (adler >>> 24) & 0xFF;
  pdf[checksumPos + 1] = (adler >>> 16) & 0xFF;
  pdf[checksumPos + 2] = (adler >>>  8) & 0xFF;
  pdf[checksumPos + 3] =  adler         & 0xFF;
  if (canInflate(pdf.subarray(stream.start, stream.end))) return true;
  // Revert
  pdf[checksumPos] = old[0]; pdf[checksumPos+1] = old[1]; pdf[checksumPos+2] = old[2]; pdf[checksumPos+3] = old[3];
  return false;
}
  546.  
/**
 * Fix streams where BFINAL bit is missing/corrupted.
 * Detects via: strict inflateRawSync fails with "unexpected end of file"
 * but Z_SYNC_FLUSH mode succeeds. Then brute-forces 1-2 byte changes
 * near the stream end to restore the BFINAL marker, followed by
 * Adler-32 checksum repair.
 *
 * All candidate edits are tried on a copy of the raw deflate bytes;
 * `pdf` is only touched once a candidate fully inflates, and is
 * reverted again if the subsequent checksum repair fails.
 */
function tryFixStreamEnd(pdf: Buffer, stream: StreamInfo): boolean {
  if (!stream.hasFilter || stream.size < 6) return false;

  const rawOrig = pdf.subarray(stream.start + 2, stream.end);
  // Only attempt if strict inflate fails with "unexpected end"
  try { inflateRawSync(rawOrig); return false; } catch (e: any) {
    if (!String(e.message || '').includes('unexpected end')) return false;
  }
  // Verify Z_SYNC_FLUSH works (data valid but BFINAL missing)
  try { inflateRawSync(rawOrig, { finishFlush: constants.Z_SYNC_FLUSH }); }
  catch { return false; }

  const raw = Buffer.from(rawOrig); // work on a copy
  // Only fuzz the last 50 bytes — the final-block marker lives near the end.
  const searchRange = Math.min(50, raw.length);
  const startIdx = Math.max(0, raw.length - searchRange);

  for (let i = startIdx; i < raw.length; i++) {
    const oi = raw[i];
    for (let v = 0; v < 256; v++) {
      if (v === oi) continue;
      raw[i] = v;
      let ok = false;
      try { inflateRawSync(raw); ok = true; } catch (e: any) {
        if (!String(e.message || '').includes('unexpected end')) {
          // Error shifted — search ±5 for second byte fix
          for (let j = Math.max(0, i - 5); j <= Math.min(raw.length - 1, i + 5); j++) {
            if (j === i) continue;
            const oj = raw[j];
            for (let w = 0; w < 256; w++) {
              if (w === oj) continue;
              raw[j] = w;
              try { inflateRawSync(raw); ok = true; } catch {}
              if (ok) {
                // Apply both changes to pdf, then fix checksum
                pdf[stream.start + 2 + i] = v;
                pdf[stream.start + 2 + j] = w;
                if (tryFixChecksum(pdf, stream)) return true;
                // Revert if checksum repair failed
                pdf[stream.start + 2 + i] = oi;
                pdf[stream.start + 2 + j] = oj;
                ok = false;
              }
            }
            raw[j] = oj;
          }
        }
      }
      if (ok) {
        // Single-byte fix succeeded
        pdf[stream.start + 2 + i] = v;
        if (tryFixChecksum(pdf, stream)) return true;
        pdf[stream.start + 2 + i] = oi; // revert
      }
      raw[i] = oi;
    }
  }

  return false;
}
  613.  
// ── Visual repair: ObjStm alignment, color space, content truncation ─

/**
 * Fix ObjStm /First header alignment. If the header byte count doesn't
 * match the /First value in the dictionary, pad with spaces so GS can
 * find embedded objects.
 *
 * Recompresses in place only when the new deflate output still fits in
 * the original stream slot (the unused tail is zero-filled), then calls
 * updateStreamLength — defined elsewhere in this file — to fix /Length.
 * Returns the refreshed binary string view of `pdf`.
 */
function fixObjStmAlignment(pdf: Buffer, pdfStr: string): string {
  const re = />>\s*stream[\r\n]+/g;
  let m;
  while ((m = re.exec(pdfStr)) !== null) {
    const start = m.index + m[0].length;
    const lookback = pdfStr.substring(Math.max(0, m.index - 500), m.index);
    const lastObj = lookback.lastIndexOf(' 0 obj');
    if (lastObj === -1) continue;
    const dictText = lookback.substring(lastObj + 6);
    // Only object streams carry a /First header offset.
    if (!/\/ObjStm/.test(dictText)) continue;

    const end = pdfStr.indexOf('endstream', start);
    if (end === -1) continue;
    const data = pdf.subarray(start, end);
    let dec: Buffer;
    try { dec = inflateSync(data); } catch { continue; }
    const text = dec.toString('utf-8');

    const firstMatch = dictText.match(/\/First\s+(\d+)/);
    if (!firstMatch) continue;
    const firstValue = parseInt(firstMatch[1]);

    // Leading "objnum offset objnum offset ..." table of the ObjStm.
    const headerMatch = text.match(/^([\d\s]+)/);
    if (!headerMatch) continue;
    const currentLen = headerMatch[0].length;

    if (currentLen === firstValue) continue;  // already aligned

    // Pad header to match /First
    const nums = headerMatch[1].trim().split(/\s+/).map(Number);
    let newHeader = '';
    for (let i = 0; i < nums.length; i += 2) {
      newHeader += nums[i] + ' ' + nums[i + 1] + ' ';
    }
    while (newHeader.length < firstValue) newHeader += ' ';
    const objData = text.substring(currentLen);
    const newContent = newHeader + objData;

    const compressed = deflateSync(Buffer.from(newContent));
    // Can't grow the stream in place; skip if the result is larger.
    if (compressed.length > end - start) continue;

    compressed.copy(pdf, start);
    // Zero-fill the now-unused tail of the old stream slot.
    for (let i = start + compressed.length; i < end; i++) pdf[i] = 0;
    updateStreamLength(pdf, pdfStr, start, compressed.length);
    pdfStr = pdf.toString('binary');
  }
  return pdfStr;
}
  669.  
/**
 * Auto-detect Separation color space → CMYK mappings from the PDF's
 * color space definition objects, then replace /CSn cs T scn operators
 * in content streams with direct CMYK k operators.
 *
 * For FunctionType 2 (exponential interpolation):
 *   CMYK = C0 + tint^N * (C1 - C0)
 *
 * Rewritten streams are recompressed in place (skipped if larger than
 * the original slot); /Length is fixed via updateStreamLength, which is
 * defined elsewhere in this file. Returns the refreshed binary string.
 */
function replaceColorSpacesWithCMYK(pdf: Buffer, pdfStr: string): string {
  // Step 1: Find /ColorSpace<<...>> in page Resource dictionaries
  // Maps CS name (e.g. "CS0") → object number
  const csRefs = new Map<string, number>();
  const csRefRe = /\/ColorSpace\s*<<([^>]+)>>/g;
  let csm;
  while ((csm = csRefRe.exec(pdfStr)) !== null) {
    const entries = csm[1].matchAll(/\/(CS\d+)\s+(\d+)\s+0\s+R/g);
    for (const e of entries) csRefs.set(e[1], parseInt(e[2]));
  }
  if (csRefs.size === 0) return pdfStr;

  // Step 2: Resolve each CS object to get C0, C1, N from the Separation definition
  // csMap: "CS0" → "1 0.57 0 0.38" (CMYK string for tint=1)
  const csMap = new Map<string, string>();
  for (const [csName, objNum] of csRefs) {
    const csContent = resolveObject(pdf, pdfStr, objNum);
    if (!csContent) continue;
    // Parse: [/Separation /name /DeviceCMYK << /C0[...] /C1[...] /N n ... >>]
    const c0Match = csContent.match(/\/C0\s*\[([^\]]+)\]/);
    const c1Match = csContent.match(/\/C1\s*\[([^\]]+)\]/);
    const nMatch = csContent.match(/\/N\s+([\d.]+)/);
    if (!c0Match || !c1Match) continue;
    const c0 = c0Match[1].trim().split(/\s+/).map(Number);
    const c1 = c1Match[1].trim().split(/\s+/).map(Number);
    const n = nMatch ? parseFloat(nMatch[1]) : 1;
    // Only 4-component (CMYK) targets are handled.
    if (c0.length !== 4 || c1.length !== 4) continue;
    if (c0.some(isNaN) || c1.some(isNaN)) continue;
    // Compute CMYK at tint=1: C0 + 1^N * (C1 - C0) = C1
    const cmyk = c1.map((v, i) => {
      const val = c0[i] + Math.pow(1, n) * (v - c0[i]);
      return Number(val.toFixed(4)).toString();
    });
    csMap.set(csName, cmyk.join(' '));
    console.log(`    Color space /${csName} (obj ${objNum}): → CMYK(${cmyk.join(', ')})`);
  }
  if (csMap.size === 0) return pdfStr;

  // Step 3: Replace /CSn cs T scn → CMYK k in content streams
  const streamRe = />>\s*stream[\r\n]+/g;
  let m;
  while ((m = streamRe.exec(pdfStr)) !== null) {
    const start = m.index + m[0].length;
    const lookback = pdfStr.substring(Math.max(0, m.index - 300), m.index);
    const lastObj = lookback.lastIndexOf(' 0 obj');
    if (lastObj === -1) continue;
    const beforeObj = lookback.substring(Math.max(0, lastObj - 10), lastObj);
    const numMatch = beforeObj.match(/(\d+)\s*$/);
    if (!numMatch) continue;

    const end = pdfStr.indexOf('endstream', start);
    if (end === -1) continue;
    const data = pdf.subarray(start, end);
    let dec: Buffer;
    try { dec = inflateSync(data); } catch { continue; }
    let text = dec.toString('utf-8');

    const orig = text;
    for (const [csName, cmyk] of csMap) {
      // Match /<CSname> cs <tint> scn (with flexible whitespace)
      const pat = new RegExp(`\\/${csName}\\s+cs\\s+([\\d.]+)\\s+scn`, 'g');
      text = text.replace(pat, (_: string, tint: string) => {
        // For tint != 1, would need to recompute, but tint=1 is by far most common
        if (parseFloat(tint) === 1) return `${cmyk} k`;
        // For other tint values, leave unchanged (rare)
        return _;
      });
    }
    if (text === orig) continue;

    const compressed = deflateSync(Buffer.from(text));
    // Can't grow the stream in place; skip if the result is larger.
    if (compressed.length > end - start) continue;
    compressed.copy(pdf, start);
    for (let i = start + compressed.length; i < end; i++) pdf[i] = 0;
    updateStreamLength(pdf, pdfStr, start, compressed.length);
    pdfStr = pdf.toString('binary');
  }
  return pdfStr;
}
  757.  
  758. /**
  759.  * Resolve an object by number — check ObjStm streams first, then standalone objects.
  760.  * Returns the object's text content, or null if not found/unreadable.
  761.  */
  762. function resolveObject(pdf: Buffer, pdfStr: string, objNum: number): string | null {
  763.   // Try standalone object first
  764.   const standaloneRe = new RegExp(`(?:^|[\\r\\n])${objNum}\\s+0\\s+obj\\b`);
  765.   const sm = standaloneRe.exec(pdfStr);
  766.   if (sm) {
  767.     const objStart = sm.index + sm[0].length;
  768.     const objEnd = pdfStr.indexOf('endobj', objStart);
  769.     if (objEnd !== -1) return pdfStr.substring(objStart, objEnd).trim();
  770.   }
  771.  
  772.   // Search ObjStm streams
  773.   const re = />>\s*stream[\r\n]+/g;
  774.   let m;
  775.   while ((m = re.exec(pdfStr)) !== null) {
  776.     const start = m.index + m[0].length;
  777.     const lookback = pdfStr.substring(Math.max(0, m.index - 500), m.index);
  778.     const lastObj = lookback.lastIndexOf(' 0 obj');
  779.     if (lastObj === -1) continue;
  780.     const dictText = lookback.substring(lastObj + 6);
  781.     if (!/\/ObjStm/.test(dictText)) continue;
  782.     const firstMatch = dictText.match(/\/First\s+(\d+)/);
  783.     if (!firstMatch) continue;
  784.     const first = parseInt(firstMatch[1]);
  785.     const end = pdfStr.indexOf('endstream', start);
  786.     if (end === -1) continue;
  787.     const data = pdf.subarray(start, end);
  788.     let dec: Buffer;
  789.     try { dec = inflateSync(data); } catch { continue; }
  790.     const text = dec.toString('utf-8');
  791.     const headerMatch = text.match(/^([\d\s]+)/);
  792.     if (!headerMatch) continue;
  793.     const nums = headerMatch[1].trim().split(/\s+/).map(Number);
  794.     for (let i = 0; i < nums.length; i += 2) {
  795.       if (nums[i] !== objNum) continue;
  796.       const offset = nums[i + 1];
  797.       const nextOffset = i + 2 < nums.length ? nums[i + 3] : text.length - first;
  798.       return text.substring(first + offset, first + nextOffset).trim();
  799.     }
  800.   }
  801.   return null;
  802. }
  803.  
/**
 * Detect and truncate garbled content in decompressed page content streams.
 * OCR errors can produce wrong deflate output that inflates OK but contains
 * invalid PDF operators, causing accidental fills/clips.
 *
 * Mutates `pdf` in place (recompressed stream copied over the old data,
 * remainder zero-filled so byte offsets stay valid) and returns a refreshed
 * binary-string view of the buffer.
 *
 * @param pdf    Decoded PDF bytes; modified in place.
 * @param pdfStr Latin-1 string view of `pdf` (offsets match bytes 1:1).
 * @returns Updated binary-string view reflecting any in-place edits.
 */
function truncateGarbledContentStreams(pdf: Buffer, pdfStr: string): string {
  const re = />>\s*stream[\r\n]+/g;
  let m;
  // NOTE: pdfStr is reassigned inside the loop, but the buffer length never
  // changes (zero-fill instead of shrink), so re.lastIndex stays consistent.
  while ((m = re.exec(pdfStr)) !== null) {
    const start = m.index + m[0].length;
    // Dictionary lives just before "stream"; 300 bytes is the lookback bound.
    const lookback = pdfStr.substring(Math.max(0, m.index - 300), m.index);
    const lastObj = lookback.lastIndexOf(' 0 obj');
    if (lastObj === -1) continue;
    const dictText = lookback.substring(lastObj + 6);
    // Only process content streams (FlateDecode, not ObjStm/XRef)
    if (!/FlateDecode/.test(dictText)) continue;
    if (/\/ObjStm|\/XRef/.test(dictText)) continue;

    const end = pdfStr.indexOf('endstream', start);
    if (end === -1) continue;
    const data = pdf.subarray(start, end);
    let dec: Buffer;
    // Streams that don't inflate at all are handled by other repair passes.
    try { dec = inflateSync(data); } catch { continue; }
    const text = dec.toString('utf-8');
    const lines = text.split('\n');

    // Find first garbled line
    let firstBad = -1;
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i].trim();
      if (line.length === 0) continue;
      if (isGarbledLine(line)) { firstBad = i; break; }
    }
    if (firstBad === -1) continue;

    // Track BT/q state in good portion
    let btDepth = 0, qDepth = 0;
    for (let i = 0; i < firstBad; i++) {
      const line = lines[i].trim();
      const bt = line.match(/\bBT\b/g), et = line.match(/\bET\b/g);
      const q = line.match(/\bq\b/g), Q = line.match(/\bQ\b/g);
      if (bt) btDepth += bt.length;
      if (et) btDepth -= et.length;
      if (q) qDepth += q.length;
      if (Q) qDepth -= Q.length;
    }

    // Keep only the good prefix, then close any open text object / graphics
    // state so the truncated stream stays balanced. A single ET is emitted —
    // assumes BT/ET don't nest in conforming content (TODO confirm).
    let truncated = lines.slice(0, firstBad).join('\n') + '\n';
    if (btDepth > 0) truncated += 'ET\n';
    while (qDepth > 0) { truncated += 'Q\n'; qDepth--; }

    // Only patch in place if the recompressed data fits in the old slot;
    // leftover bytes are zeroed so file offsets stay unchanged.
    const compressed = deflateSync(Buffer.from(truncated));
    if (compressed.length > end - start) continue;
    compressed.copy(pdf, start);
    for (let i = start + compressed.length; i < end; i++) pdf[i] = 0;
    updateStreamLength(pdf, pdfStr, start, compressed.length);
    pdfStr = pdf.toString('binary');
  }
  return pdfStr;
}
  864.  
  865. function isGarbledLine(line: string): boolean {
  866.   // Non-printable chars outside parenthesized strings
  867.   let inStr = false, depth = 0, bad = 0;
  868.   for (let j = 0; j < line.length; j++) {
  869.     const c = line.charCodeAt(j);
  870.     if (line[j] === '(' && !inStr) { inStr = true; depth++; }
  871.     else if (line[j] === '(' && inStr) depth++;
  872.     else if (line[j] === ')' && inStr) { depth--; if (depth === 0) inStr = false; }
  873.     if (!inStr && (c < 32 || c > 126) && c !== 9) bad++;
  874.   }
  875.   if (bad > 0) return true;
  876.   // Operators merged with numbers: Q703, -26547Q
  877.   if (/[a-zA-Z]{2}\d{3,}[a-zA-Z]/.test(line)) return true;
  878.   if (/[QqfsSBWnhmc]\d{3,}/.test(line) && !/scn/.test(line)) return true;
  879.   if (/\d{3,}[QqfhBWnm](?:\s|$)/.test(line)) return true;
  880.   // Numbers with impossible dots: 0.5.148
  881.   if (/\d+\.\d+\.\d+/.test(line) && !/Tm/.test(line) && !line.includes('(')) return true;
  882.   // Very long numbers (garbled)
  883.   if (/\d{6,}/.test(line) && !/Tm|cm/.test(line)) return true;
  884.   return false;
  885. }
  886.  
  887. /** Update /Length in stream dictionary after recompressing */
  888. function updateStreamLength(pdf: Buffer, pdfStr: string, streamStart: number, newLen: number): void {
  889.   // Search backward for the ">>" that ends the dictionary, then find /Length
  890.   // within the dictionary only (not in preceding stream data).
  891.   const area = pdfStr.substring(Math.max(0, streamStart - 500), streamStart);
  892.   // Find the last ">>" before "stream" — that's the dictionary end
  893.   const dictEnd = area.lastIndexOf('>>');
  894.   if (dictEnd === -1) return;
  895.   // Find the start of this object's dictionary (look for "obj")
  896.   const objStart = area.lastIndexOf(' 0 obj');
  897.   const dictStart = objStart >= 0 ? objStart : 0;
  898.   const dictOnly = area.substring(dictStart, dictEnd + 2);
  899.   const lengthMatch = dictOnly.match(/\/Length\s+(\d+)/);
  900.   if (!lengthMatch) return;
  901.   const oldLen = lengthMatch[1];
  902.   const padded = String(newLen).padStart(oldLen.length, ' ');
  903.   const offset = Math.max(0, streamStart - 500) + dictStart + dictOnly.lastIndexOf('/Length ' + oldLen) + 8;
  904.   for (let i = 0; i < oldLen.length; i++) pdf[offset + i] = padded.charCodeAt(i);
  905. }
  906.  
  907. // ── Memory-efficient brute-force l→1 ────────────────────────────────
  908.  
  909. /**
  910.  * Try l→1 substitutions using direct stream data manipulation.
  911.  * No full PDF re-decode per trial — only modifies affected bytes in
  912.  * a pre-extracted stream data buffer, then tries inflate.
  913.  */
  914. function bruteForceL1(
  915.   b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
  916.   maxDepth: number = 4,
  917. ): { fixed: boolean; changes: number; depth: number } {
  918.   const b64Start = pdfToB64(stream.start);
  919.   const b64End = Math.min(b64Buf.length, pdfToB64(stream.end) + 4);
  920.  
  921.   const lPos: number[] = [];
  922.   for (let i = b64Start; i < b64End; i++) {
  923.     if (b64Buf[i] === L_BYTE) lPos.push(i);
  924.   }
  925.  
  926.   // Depth 1
  927.   for (const p of lPos) {
  928.     const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, p, ONE_BYTE);
  929.     if (changed && canInflate(streamData)) {
  930.       b64Buf[p] = ONE_BYTE;
  931.       return { fixed: true, changes: 1, depth: 1 };
  932.     }
  933.     revertB64Change(streamData, stream.start, p, oldBytes);
  934.   }
  935.   if (maxDepth < 2) return { fixed: false, changes: 0, depth: 0 };
  936.  
  937.   // Depth 2
  938.   for (let i = 0; i < lPos.length; i++) {
  939.     const r1 = applyB64Change(b64Buf, streamData, stream.start, lPos[i], ONE_BYTE);
  940.     b64Buf[lPos[i]] = ONE_BYTE;
  941.     for (let j = i + 1; j < lPos.length; j++) {
  942.       const r2 = applyB64Change(b64Buf, streamData, stream.start, lPos[j], ONE_BYTE);
  943.       if (r2.changed && canInflate(streamData)) {
  944.         b64Buf[lPos[j]] = ONE_BYTE;
  945.         return { fixed: true, changes: 2, depth: 2 };
  946.       }
  947.       revertB64Change(streamData, stream.start, lPos[j], r2.oldBytes);
  948.     }
  949.     b64Buf[lPos[i]] = L_BYTE;
  950.     revertB64Change(streamData, stream.start, lPos[i], r1.oldBytes);
  951.   }
  952.   if (maxDepth < 3 || lPos.length > 50) return { fixed: false, changes: 0, depth: 0 };
  953.  
  954.   // Depth 3
  955.   for (let i = 0; i < lPos.length; i++) {
  956.     const r1 = applyB64Change(b64Buf, streamData, stream.start, lPos[i], ONE_BYTE);
  957.     b64Buf[lPos[i]] = ONE_BYTE;
  958.     for (let j = i + 1; j < lPos.length; j++) {
  959.       const r2 = applyB64Change(b64Buf, streamData, stream.start, lPos[j], ONE_BYTE);
  960.       b64Buf[lPos[j]] = ONE_BYTE;
  961.       for (let k = j + 1; k < lPos.length; k++) {
  962.         const r3 = applyB64Change(b64Buf, streamData, stream.start, lPos[k], ONE_BYTE);
  963.         if (r3.changed && canInflate(streamData)) {
  964.           b64Buf[lPos[k]] = ONE_BYTE;
  965.           return { fixed: true, changes: 3, depth: 3 };
  966.         }
  967.         revertB64Change(streamData, stream.start, lPos[k], r3.oldBytes);
  968.       }
  969.       b64Buf[lPos[j]] = L_BYTE;
  970.       revertB64Change(streamData, stream.start, lPos[j], r2.oldBytes);
  971.     }
  972.     b64Buf[lPos[i]] = L_BYTE;
  973.     revertB64Change(streamData, stream.start, lPos[i], r1.oldBytes);
  974.   }
  975.   if (maxDepth < 4 || lPos.length > 45) return { fixed: false, changes: 0, depth: 0 };
  976.  
  977.   // Depth 4
  978.   for (let i = 0; i < lPos.length; i++) {
  979.     const r1 = applyB64Change(b64Buf, streamData, stream.start, lPos[i], ONE_BYTE);
  980.     b64Buf[lPos[i]] = ONE_BYTE;
  981.     for (let j = i + 1; j < lPos.length; j++) {
  982.       const r2 = applyB64Change(b64Buf, streamData, stream.start, lPos[j], ONE_BYTE);
  983.       b64Buf[lPos[j]] = ONE_BYTE;
  984.       for (let k = j + 1; k < lPos.length; k++) {
  985.         const r3 = applyB64Change(b64Buf, streamData, stream.start, lPos[k], ONE_BYTE);
  986.         b64Buf[lPos[k]] = ONE_BYTE;
  987.         for (let m = k + 1; m < lPos.length; m++) {
  988.           const r4 = applyB64Change(b64Buf, streamData, stream.start, lPos[m], ONE_BYTE);
  989.           if (r4.changed && canInflate(streamData)) {
  990.             b64Buf[lPos[m]] = ONE_BYTE;
  991.             return { fixed: true, changes: 4, depth: 4 };
  992.           }
  993.           revertB64Change(streamData, stream.start, lPos[m], r4.oldBytes);
  994.         }
  995.         b64Buf[lPos[k]] = L_BYTE;
  996.         revertB64Change(streamData, stream.start, lPos[k], r3.oldBytes);
  997.       }
  998.       b64Buf[lPos[j]] = L_BYTE;
  999.       revertB64Change(streamData, stream.start, lPos[j], r2.oldBytes);
  1000.     }
  1001.     b64Buf[lPos[i]] = L_BYTE;
  1002.     revertB64Change(streamData, stream.start, lPos[i], r1.oldBytes);
  1003.   }
  1004.  
  1005.   return { fixed: false, changes: 0, depth: 0 };
  1006. }
  1007.  
  1008. // ── Iterative error fixing ───────────────────────────────────────────
  1009.  
  1010. /**
  1011.  * Check if inflate can get past a known error position.
  1012.  * Returns true if the error at errPos is no longer present
  1013.  * (may still fail later at a different position).
  1014.  */
  1015. function passesErrPos(streamData: Buffer, errPos: number): boolean {
  1016.   if (errPos < 3 || streamData.length < 3) return false;
  1017.   const testLen = Math.min(streamData.length - 2, errPos + 3);
  1018.   try {
  1019.     inflateRawSync(streamData.subarray(2, 2 + testLen), { finishFlush: constants.Z_SYNC_FLUSH });
  1020.     return true;
  1021.   } catch (e: any) {
  1022.     const msg = String(e.message || '');
  1023.     return msg.includes('unexpected end') || msg.includes('buffer error') || msg.includes('incomplete');
  1024.   }
  1025. }
  1026.  
/**
 * Iteratively fix stream errors by finding and fixing one corruption at a time.
 * For each error position, tries all base64 substitutions in a window before the
 * error. Keeps changes that push the error position significantly forward.
 * Reverts all changes if the stream doesn't ultimately decompress.
 *
 * @param b64Buf     Full base64 text of the PDF (mutated in place on success).
 * @param streamData Extracted stream bytes for `stream` (mutated in place).
 * @param stream     Offsets of the stream within the decoded PDF.
 * @param maxIter    Upper bound on iterations (one accepted change per iteration).
 * @param windowB64  How many base64 chars before the error position to scan.
 * @returns fixed=true when the stream inflates; on failure every applied
 *          change is rolled back and totalChanges is 0.
 */
function iterativeErrorFix(
  b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
  maxIter: number = 15,
  windowB64: number = 300,
): { fixed: boolean; totalChanges: number; desc: string } {
  // Log of accepted changes so everything can be rolled back on failure.
  // NOTE(review): oldBytes is recorded but never used — the rollback below
  // re-derives bytes via applyB64Change(pos, from); looks redundant.
  const changes: { pos: number; from: number; oldBytes: [number, number, number] }[] = [];

  for (let iter = 0; iter < maxIter; iter++) {
    if (canInflate(streamData)) {
      return { fixed: true, totalChanges: changes.length, desc: `${changes.length} fixes in ${iter} iter` };
    }

    const errPos = findCorruptionOffset(streamData);
    if (errPos >= streamData.length - 4) {
      // Error at end — try checksum repair
      if (tryFixChecksumOnData(streamData)) {
        return { fixed: true, totalChanges: changes.length, desc: `${changes.length} fixes + checksum` };
      }
      break;
    }

    // Map the stream-relative error offset back to a base64 position, then
    // scan mostly *before* it — deflate errors surface at or after the
    // corrupt byte, rarely before.
    const errPdfPos = stream.start + errPos;
    const b64Center = pdfToB64(errPdfPos);
    const searchLo = Math.max(pdfToB64(stream.start), b64Center - windowB64);
    const searchHi = Math.min(pdfToB64(stream.end) + 4, b64Center + 20);

    // Best candidate this iteration: the single substitution that pushes the
    // error position furthest forward.
    let bestPos = -1;
    let bestTo = 0;
    let bestNewErrPos = errPos;
    let bestOldBytes: [number, number, number] = [0, 0, 0];

    for (let i = searchLo; i < searchHi; i++) {
      const origChar = b64Buf[i];
      // Try confusion pairs first (more likely correct), then all other chars
      const confPairs = SUBS_MAP.get(origChar) || [];
      const allChars: number[] = [...confPairs];
      for (let c = 0; c < 64; c++) {
        const ch = B64_CHARS.charCodeAt(c);
        if (ch !== origChar && !confPairs.includes(ch)) allChars.push(ch);
      }

      for (const newChar of allChars) {
        const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, i, newChar);
        if (!changed) { revertB64Change(streamData, stream.start, i, oldBytes); continue; }

        if (canInflate(streamData)) {
          // Full success — commit this character and stop immediately.
          b64Buf[i] = newChar;
          return { fixed: true, totalChanges: changes.length + 1, desc: `${changes.length + 1} fixes` };
        }

        if (passesErrPos(streamData, errPos)) {
          const newErrPos = findCorruptionOffset(streamData);
          if (newErrPos > bestNewErrPos) {
            bestNewErrPos = newErrPos;
            bestPos = i;
            bestTo = newChar;
            bestOldBytes = [...oldBytes] as [number, number, number];
          }
        }

        revertB64Change(streamData, stream.start, i, oldBytes);
      }
    }

    // No substitution made forward progress — give up on this stream.
    if (bestPos === -1 || bestNewErrPos <= errPos) break;

    // Apply best change
    const origChar = b64Buf[bestPos];
    applyB64Change(b64Buf, streamData, stream.start, bestPos, bestTo);
    b64Buf[bestPos] = bestTo;
    changes.push({ pos: bestPos, from: origChar, oldBytes: bestOldBytes });
  }

  // Check one more time (might have fixed all data errors, just checksum left)
  if (canInflate(streamData)) {
    return { fixed: true, totalChanges: changes.length, desc: `${changes.length} fixes` };
  }

  // Not fixed — revert all changes in reverse order
  for (let i = changes.length - 1; i >= 0; i--) {
    const ch = changes[i];
    applyB64Change(b64Buf, streamData, stream.start, ch.pos, ch.from);
    b64Buf[ch.pos] = ch.from;
  }
  const partial = changes.length > 0 ? ` (${changes.length} partial, reverted)` : '';
  return { fixed: false, totalChanges: 0, desc: `no convergence${partial}` };
}
  1120.  
  1121. /**
  1122.  * Try to fix Adler-32 checksum on a standalone stream data buffer.
  1123.  * Modifies the buffer in-place if successful.
  1124.  */
  1125. function tryFixChecksumOnData(streamData: Buffer): boolean {
  1126.   if (streamData.length < 6) return false;
  1127.   for (const trim of [2, 1, 0]) {
  1128.     const actualEnd = streamData.length - trim;
  1129.     if (actualEnd < 6) continue;
  1130.     let decompressed: Buffer;
  1131.     try { decompressed = inflateRawSync(streamData.subarray(2, actualEnd)); } catch { continue; }
  1132.     const adler = computeAdler32(decompressed);
  1133.     const pos = actualEnd - 4;
  1134.     const old = [streamData[pos], streamData[pos+1], streamData[pos+2], streamData[pos+3]];
  1135.     streamData[pos]     = (adler >>> 24) & 0xFF;
  1136.     streamData[pos + 1] = (adler >>> 16) & 0xFF;
  1137.     streamData[pos + 2] = (adler >>>  8) & 0xFF;
  1138.     streamData[pos + 3] =  adler         & 0xFF;
  1139.     if (canInflate(streamData)) return true;
  1140.     streamData[pos] = old[0]; streamData[pos+1] = old[1]; streamData[pos+2] = old[2]; streamData[pos+3] = old[3];
  1141.   }
  1142.   return false;
  1143. }
  1144.  
  1145. // ── DFS search with backtracking ─────────────────────────────────────
  1146.  
/**
 * Depth-first search for multi-error streams.
 * At each level, find the error position, try substitutions, and for the
 * top candidates that push past the error, recurse to fix the next error.
 * Backtracks if a path doesn't converge.
 *
 * @param b64Buf       Full base64 text of the PDF (mutated in place on success).
 * @param streamData   Extracted stream bytes for `stream` (mutated in place).
 * @param stream       Offsets of the stream within the decoded PDF.
 * @param maxDepth     Maximum recursion depth (one fixed corruption per level).
 * @param branchFactor How many of the best candidates to recurse into per level.
 * @param windowB64    Base64 window before the error scanned for confusion pairs.
 * @returns fixed=true with all winning substitutions applied, or fixed=false
 *          with every buffer change rolled back.
 */
function dfsErrorSearch(
  b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
  maxDepth: number = 8,
  branchFactor: number = 2,
  windowB64: number = 200,
): { fixed: boolean; totalChanges: number; desc: string } {
  // Stack of substitutions currently applied along the active search path.
  const applied: { pos: number; origChar: number }[] = [];
  let nodeCount = 0;
  const maxNodes = 500; // Limit total search nodes to prevent OOM

  function search(depth: number): boolean {
    if (canInflate(streamData)) return true;
    if (depth >= maxDepth || nodeCount >= maxNodes) {
      // Budget exhausted — last resort: maybe only the checksum is wrong.
      return tryFixChecksumOnData(streamData);
    }
    nodeCount++;

    const errPos = findCorruptionOffset(streamData);
    if (errPos >= streamData.length - 4) {
      // Error sits in the trailing bytes — likely a checksum problem.
      return tryFixChecksumOnData(streamData);
    }

    // Map the error offset to a base64 window mostly *before* the failure.
    const errPdfPos = stream.start + errPos;
    const b64Center = pdfToB64(errPdfPos);
    const lo = Math.max(pdfToB64(stream.start), b64Center - windowB64);
    const hi = Math.min(pdfToB64(stream.end) + 4, b64Center + 20);

    // Find candidates — confusion pairs first, fallback to all chars
    const candidates: { pos: number; to: number; improvement: number }[] = [];

    // Pass 1: confusion pairs only
    for (let i = lo; i < hi; i++) {
      const origChar = b64Buf[i];
      const confPairs = SUBS_MAP.get(origChar);
      if (!confPairs) continue;

      for (const newChar of confPairs) {
        const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, i, newChar);
        if (!changed) { revertB64Change(streamData, stream.start, i, oldBytes); continue; }

        if (canInflate(streamData)) {
          // Full success — commit and unwind immediately.
          b64Buf[i] = newChar;
          applied.push({ pos: i, origChar });
          return true;
        }

        // Score by how far the substitution pushes the error position.
        let improvement = 0;
        if (passesErrPos(streamData, errPos)) {
          const newErrPos = findCorruptionOffset(streamData);
          improvement = newErrPos - errPos;
        }

        revertB64Change(streamData, stream.start, i, oldBytes);

        if (improvement > 0) {
          candidates.push({ pos: i, to: newChar, improvement });
        }
      }
    }

    // Pass 2: if confusion pairs found nothing, try all chars in very tight window
    if (candidates.length === 0) {
      const tightLo = Math.max(pdfToB64(stream.start), b64Center - 40);
      const tightHi = Math.min(pdfToB64(stream.end) + 4, b64Center + 10);
      for (let i = tightLo; i < tightHi; i++) {
        const origChar = b64Buf[i];
        for (let c = 0; c < 64; c++) {
          const newChar = B64_CHARS.charCodeAt(c);
          if (newChar === origChar) continue;
          if (SUBS_MAP.get(origChar)?.includes(newChar)) continue; // already tried

          const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, i, newChar);
          if (!changed) { revertB64Change(streamData, stream.start, i, oldBytes); continue; }

          if (canInflate(streamData)) {
            b64Buf[i] = newChar;
            applied.push({ pos: i, origChar });
            return true;
          }

          let improvement = 0;
          if (passesErrPos(streamData, errPos)) {
            const newErrPos = findCorruptionOffset(streamData);
            improvement = newErrPos - errPos;
          }

          revertB64Change(streamData, stream.start, i, oldBytes);

          if (improvement > 0) {
            candidates.push({ pos: i, to: newChar, improvement });
          }
        }
      }
    }

    // Recurse into the most promising candidates only (bounded branching).
    candidates.sort((a, b) => b.improvement - a.improvement);

    for (const cand of candidates.slice(0, branchFactor)) {
      if (nodeCount >= maxNodes) break;
      const origChar = b64Buf[cand.pos];
      applyB64Change(b64Buf, streamData, stream.start, cand.pos, cand.to);
      b64Buf[cand.pos] = cand.to;
      applied.push({ pos: cand.pos, origChar });

      if (search(depth + 1)) return true;

      // Backtrack: undo this substitution in both buffers.
      applied.pop();
      applyB64Change(b64Buf, streamData, stream.start, cand.pos, origChar);
      b64Buf[cand.pos] = origChar;
    }

    return false;
  }

  const success = search(0);

  if (!success) {
    // Roll back anything left applied (reverse order restores byte-exactly).
    for (let i = applied.length - 1; i >= 0; i--) {
      const { pos, origChar } = applied[i];
      applyB64Change(b64Buf, streamData, stream.start, pos, origChar);
      b64Buf[pos] = origChar;
    }
    return { fixed: false, totalChanges: 0, desc: `DFS exhausted (${nodeCount} nodes)` };
  }

  return { fixed: true, totalChanges: applied.length, desc: `DFS ${applied.length} fixes` };
}
  1280.  
  1281. // ── Position-guided search (all confusion pairs, memory-efficient) ──
  1282.  
/**
 * Position-guided search: use the zlib error offset to select a window of
 * base64 positions around the failure, then try OCR confusion-pair
 * substitutions (from SUBS_MAP) at depth 1, 2 and 3 until the stream inflates.
 *
 * @param b64Buf       Full base64 text of the PDF (mutated in place on success).
 * @param streamData   Extracted stream bytes for `stream` (mutated in place).
 * @param stream       Offsets of the stream within the decoded PDF.
 * @param windowRadius Base64 chars scanned on each side of the error position.
 * @returns fixed=true with the winning substitutions applied; otherwise the
 *          buffers are left unchanged.
 */
function positionGuidedSearch(
  b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
  windowRadius: number = 60,
): { fixed: boolean; changes: number; desc: string } {
  const errPos = findCorruptionOffset(streamData);
  const errorPdfPos = stream.start + errPos;
  const b64Center = pdfToB64(errorPdfPos);

  // Collect every (position, from, to) confusion-pair substitution in the window.
  interface Cand { pos: number; from: number; to: number }
  const candidates: Cand[] = [];
  const lo = Math.max(pdfToB64(stream.start), b64Center - windowRadius);
  const hi = Math.min(pdfToB64(stream.end) + 4, b64Center + windowRadius);
  for (let i = lo; i < hi; i++) {
    const ch = b64Buf[i];
    const subs = SUBS_MAP.get(ch);
    if (subs) {
      for (const to of subs) candidates.push({ pos: i, from: ch, to });
    }
  }

  if (candidates.length === 0) {
    return { fixed: false, changes: 0, desc: `no candidates near byte ${errPos}` };
  }

  // Depth 1
  for (const c of candidates) {
    const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, c.pos, c.to);
    if (changed && canInflate(streamData)) {
      b64Buf[c.pos] = c.to;
      return { fixed: true, changes: 1, desc: `1x ${String.fromCharCode(c.from)}->${String.fromCharCode(c.to)} near byte ${errPos}` };
    }
    revertB64Change(streamData, stream.start, c.pos, oldBytes);
  }

  // Depth 2 — capped at the first 40 candidates to bound the O(n^2) pass.
  const maxDouble = Math.min(candidates.length, 40);
  for (let i = 0; i < maxDouble; i++) {
    const r1 = applyB64Change(b64Buf, streamData, stream.start, candidates[i].pos, candidates[i].to);
    b64Buf[candidates[i].pos] = candidates[i].to;
    for (let j = i + 1; j < maxDouble; j++) {
      // Skip candidates whose position was already rewritten by the outer pick.
      if (b64Buf[candidates[j].pos] !== candidates[j].from) continue;
      const r2 = applyB64Change(b64Buf, streamData, stream.start, candidates[j].pos, candidates[j].to);
      if (r2.changed && canInflate(streamData)) {
        b64Buf[candidates[j].pos] = candidates[j].to;
        return { fixed: true, changes: 2, desc: `2x near byte ${errPos}` };
      }
      revertB64Change(streamData, stream.start, candidates[j].pos, r2.oldBytes);
    }
    // Undo the outer pick before advancing.
    b64Buf[candidates[i].pos] = candidates[i].from;
    revertB64Change(streamData, stream.start, candidates[i].pos, r1.oldBytes);
  }

  // Depth 3 — tighter cap (25) to bound the O(n^3) pass.
  const maxTriple = Math.min(candidates.length, 25);
  for (let i = 0; i < maxTriple; i++) {
    const r1 = applyB64Change(b64Buf, streamData, stream.start, candidates[i].pos, candidates[i].to);
    b64Buf[candidates[i].pos] = candidates[i].to;
    for (let j = i + 1; j < maxTriple; j++) {
      if (b64Buf[candidates[j].pos] !== candidates[j].from) continue;
      const r2 = applyB64Change(b64Buf, streamData, stream.start, candidates[j].pos, candidates[j].to);
      b64Buf[candidates[j].pos] = candidates[j].to;
      for (let k = j + 1; k < maxTriple; k++) {
        if (b64Buf[candidates[k].pos] !== candidates[k].from) continue;
        const r3 = applyB64Change(b64Buf, streamData, stream.start, candidates[k].pos, candidates[k].to);
        if (r3.changed && canInflate(streamData)) {
          b64Buf[candidates[k].pos] = candidates[k].to;
          return { fixed: true, changes: 3, desc: `3x near byte ${errPos}` };
        }
        revertB64Change(streamData, stream.start, candidates[k].pos, r3.oldBytes);
      }
      b64Buf[candidates[j].pos] = candidates[j].from;
      revertB64Change(streamData, stream.start, candidates[j].pos, r2.oldBytes);
    }
    b64Buf[candidates[i].pos] = candidates[i].from;
    revertB64Change(streamData, stream.start, candidates[i].pos, r1.oldBytes);
  }

  return { fixed: false, changes: 0, desc: `no fix (${candidates.length} candidates near byte ${errPos}/${stream.size})` };
}
  1362.  
  1363. // ── Main ────────────────────────────────────────────────────────────
  1364.  
/**
 * End-to-end recovery pipeline.
 *
 * Maintains three synchronized views of the document that MUST be kept in
 * lockstep (the repair helpers patch one and the others are re-derived):
 *   - base64Buf: the repairable base64 text as a Buffer (patched in place),
 *   - pdf:       the decoded binary PDF bytes,
 *   - pdfStr:    pdf rendered as a 'binary' (latin1) string, used for the
 *                regex/offset-based structure scans (findAllStreams etc.).
 * If you modify this flow, preserve the re-decode points — stream offsets
 * in pdfStr are only valid for the pdf they were derived from.
 *
 * Pipeline:
 *   1. Read input file, strip whitespace from the base64 text.
 *   2. Auto-discover and fix OCR character errors in the base64.
 *   3. Locate all PDF streams; classify FlateDecode vs uncompressed.
 *   4. Repair streams whose only damage is a bad Adler-32 checksum.
 *   5. Multi-phase stream repair: guided search, brute-force l->1,
 *      iterative greedy fixes (5), checksum pass (5b), DFS second pass
 *      (5c), optional Rust-accelerated search (5d), BFINAL end repair
 *      (5e), visual/ObjStm cleanup (5f).
 *   6. Save raw PDF, re-distill with Ghostscript, report extracted text.
 *
 * Exits with code 1 on bad usage; otherwise logs progress to stdout.
 */
async function main() {
  const args = process.argv.slice(2);

  if (args.length < 2) {
    console.log("Usage: bun recover_pdf.ts <input.txt> <output.pdf>");
    process.exit(1);
  }

  const inputFile = args[0];
  const outputFile = args[1];

  console.log("=".repeat(64));
  console.log("  PDF Recovery Script");
  console.log("=".repeat(64));

  // ─── Step 1: Read and clean base64 ──────────────────────────────

  console.log(`\n[1/6] Reading ${inputFile}...`);
  const rawBase64 = readFileSync(inputFile, "utf-8").trim();
  console.log(`  ${rawBase64.length} characters (with whitespace)`);

  // Collapse all whitespace (line wrapping from the OCR source) so byte
  // positions in the base64 map 1:1 onto decoded PDF byte positions * 4/3.
  let base64Clean = rawBase64.replace(/\s+/g, '');
  console.log(`  ${base64Clean.length} base64 characters`);

  // ─── Step 2: Auto-discover and fix OCR errors ──────────────────

  // autoDiscoverAndFixOCRErrors (defined earlier in this file) returns the
  // initial synchronized triple plus a count of substitutions it applied.
  console.log("\n[2/6] Auto-discovering and fixing OCR errors...");
  const autoResult = autoDiscoverAndFixOCRErrors(base64Clean);
  let base64Buf = autoResult.base64Buf;
  let pdf = autoResult.pdf;
  let pdfStr = autoResult.pdfStr;
  console.log(`  ${autoResult.fixCount} auto-discovered fixes applied`);

  // ─── Step 3: Analyze streams ────────────────────────────────────

  console.log("\n[3/6] Analyzing streams...");
  // Defensive re-decode: make pdf/pdfStr reflect every patch now present
  // in base64Buf before taking stream offsets from pdfStr.
  pdf = Buffer.from(base64Buf.toString(), 'base64');
  pdfStr = pdf.toString('binary');

  // Only FlateDecode streams can be validated/repaired via inflate; raw
  // (unfiltered) streams have no checksum to verify and are skipped.
  let allStreams = findAllStreams(pdfStr);
  let flatStreams = allStreams.filter(s => s.hasFilter);
  const nonFlatStreams = allStreams.filter(s => !s.hasFilter);
  let okCount = flatStreams.filter(s => canInflate(pdf.subarray(s.start, s.end))).length;

  console.log(`  Found ${allStreams.length} streams total`);
  console.log(`    ${flatStreams.length} FlateDecode (compressed), ${okCount} decompress OK`);
  console.log(`    ${nonFlatStreams.length} uncompressed (skipped)`);

  // Page content objects get repair priority below — they carry the
  // visible document text.
  const pageContentObjs = findPageContentObjects(pdfStr);

  // ─── Step 4: Fix Adler-32 checksums ─────────────────────────────

  // Cheapest repair first: if the deflate data is intact and only the
  // trailing Adler-32 is corrupted, tryFixChecksum patches pdf in place.
  console.log("\n[4/6] Repairing Adler-32 checksums...");
  let checksumFixed = 0;

  for (const stream of flatStreams) {
    if (canInflate(pdf.subarray(stream.start, stream.end))) continue;
    if (tryFixChecksum(pdf, stream)) {
      checksumFixed++;
      console.log(`  Object ${stream.obj}: checksum fixed`);
    }
  }

  if (checksumFixed > 0) {
    // tryFixChecksum mutated pdf — propagate back to the other two views.
    base64Buf = Buffer.from(pdf.toString('base64'));
    pdfStr = pdf.toString('binary');
    console.log(`  ${checksumFixed} streams fixed via checksum repair`);
  } else {
    console.log("  (no checksum-only errors found)");
  }

  // Refresh stream list
  allStreams = findAllStreams(pdfStr);
  flatStreams = allStreams.filter(s => s.hasFilter);
  okCount = flatStreams.filter(s => canInflate(pdf.subarray(s.start, s.end))).length;
  console.log(`  Status: ${okCount}/${flatStreams.length} compressed streams OK`);

  // ─── Step 5: Stream repair ──────────────────────────────────────

  console.log("\n[5/6] Stream repair...");

  // Order of attack: page-content streams first, then smallest-first so
  // the cheap wins land before the expensive searches.
  const failedStreams = flatStreams
    .filter(s => !canInflate(pdf.subarray(s.start, s.end)))
    .sort((a, b) => {
      const aPage = pageContentObjs.includes(a.obj);
      const bPage = pageContentObjs.includes(b.obj);
      if (aPage && !bPage) return -1;
      if (!aPage && bPage) return 1;
      return a.size - b.size;
    });

  console.log(`  ${failedStreams.length} streams to repair\n`);

  let totalFixed = checksumFixed;

  for (const stream of failedStreams) {
    // Re-decode PDF to get current state (only once per stream, not per trial)
    pdf = Buffer.from(base64Buf.toString(), 'base64');
    pdfStr = pdf.toString('binary');
    // Offsets may have shifted if an earlier repair changed structure;
    // re-locate this object's stream in the fresh pdfStr.
    const currentStream = findObjStream(pdfStr, stream.obj) ||
                          findAllStreams(pdfStr).find(s => s.obj === stream.obj);
    if (!currentStream) continue;

    // Extract stream data for in-place manipulation
    // (copy, so trial mutations don't touch pdf itself).
    const streamData = Buffer.from(pdf.subarray(currentStream.start, currentStream.end));
    if (canInflate(streamData)) continue;

    const isPage = pageContentObjs.includes(stream.obj);
    const label = isPage ? '[PAGE]' : '      ';
    process.stdout.write(`  Obj ${String(stream.obj).padStart(2)} ${label} (${currentStream.size}b): `);

    // Phase A: Position-guided search with confusion pairs (memory-efficient)
    // On success these phase helpers patch base64Buf in place (see the
    // step 5b note below), so the next loop iteration re-decodes the fix.
    const guided = positionGuidedSearch(base64Buf, streamData, currentStream);
    if (guided.fixed) {
      totalFixed++;
      console.log(`FIXED [guided] ${guided.desc}`);
      continue;
    }

    // Phase B: Brute-force l→1 (memory-efficient)
    // Depth budget scales inversely with stream size to bound runtime.
    const maxBfDepth = currentStream.size < 3000 ? 4 : 2;
    const bf = bruteForceL1(base64Buf, streamData, currentStream, maxBfDepth);
    if (bf.fixed) {
      totalFixed++;
      console.log(`FIXED [brute-force] ${bf.changes}x l->1 (depth ${bf.depth})`);
      continue;
    }

    // Phase C: Iterative error fixing (greedy, one error at a time)
    const maxIter = currentStream.size < 5000 ? 15 : 10;
    const window = currentStream.size < 5000 ? 300 : 200;
    const iterResult = iterativeErrorFix(base64Buf, streamData, currentStream, maxIter, window);
    if (iterResult.fixed) {
      totalFixed++;
      console.log(`FIXED [iterative] ${iterResult.desc}`);
      continue;
    }

    // Not fixed
    const err = getError(streamData);
    console.log(`not fixed (${err.substring(0, 45)}) ${iterResult.desc}`);
  }

  // ─── Step 5b: Post-repair checksum pass ─────────────────────────
  // Iterative fixes corrected data errors in b64Buf but checksum
  // patches were only applied to local streamData buffers. Re-decode
  // and repair checksums on all remaining failing streams.

  pdf = Buffer.from(base64Buf.toString(), 'base64');
  pdfStr = pdf.toString('binary');

  console.log("\n  Post-repair checksum pass...");
  let postChecksumFixed = 0;
  const refreshedStreams = findAllStreams(pdfStr).filter(s => s.hasFilter);
  for (const stream of refreshedStreams) {
    if (canInflate(pdf.subarray(stream.start, stream.end))) continue;
    if (tryFixChecksum(pdf, stream)) {
      postChecksumFixed++;
      totalFixed++;
    }
  }
  if (postChecksumFixed > 0) {
    console.log(`  ${postChecksumFixed} additional streams fixed via checksum repair`);
    pdfStr = pdf.toString('binary');
  } else {
    console.log("  (no additional checksum fixes)");
  }

  // ─── Step 5c: Second pass with wider search for remaining ──────

  const stillFailing = findAllStreams(pdfStr)
    .filter(s => s.hasFilter && !canInflate(pdf.subarray(s.start, s.end)));

  if (stillFailing.length > 0) {
    // Re-sync base64Buf with the checksum-repaired pdf
    base64Buf = Buffer.from(pdf.toString('base64'));

    console.log(`\n  Second pass (wider search) for ${stillFailing.length} remaining streams...\n`);
    for (const stream of stillFailing) {
      pdf = Buffer.from(base64Buf.toString(), 'base64');
      pdfStr = pdf.toString('binary');
      const currentStream = findObjStream(pdfStr, stream.obj) ||
                            findAllStreams(pdfStr).find(s => s.obj === stream.obj);
      if (!currentStream) continue;

      const streamData = Buffer.from(pdf.subarray(currentStream.start, currentStream.end));
      if (canInflate(streamData)) continue;

      process.stdout.write(`  Obj ${String(stream.obj).padStart(2)} (${currentStream.size}b): `);

      // DFS over a 15 KB stream explodes combinatorially — cap the size.
      if (currentStream.size > 15000) {
        console.log(`skipped (too large for DFS)`);
        continue;
      }

      // DFS with backtracking — explores alternative fixes when greedy gets stuck
      const dfsResult = dfsErrorSearch(base64Buf, streamData, currentStream, 12, 2, 300);
      if (dfsResult.fixed) {
        totalFixed++;
        console.log(`FIXED [DFS] ${dfsResult.desc}`);
      } else {
        console.log(`not fixed ${dfsResult.desc}`);
      }
    }

    // Final checksum repair for any new fixes from second pass
    pdf = Buffer.from(base64Buf.toString(), 'base64');
    pdfStr = pdf.toString('binary');
    for (const stream of findAllStreams(pdfStr).filter(s => s.hasFilter)) {
      if (canInflate(pdf.subarray(stream.start, stream.end))) continue;
      if (tryFixChecksum(pdf, stream)) { totalFixed++; postChecksumFixed++; }
    }
    pdfStr = pdf.toString('binary');
  }

  // ─── Step 5d: Rust-accelerated repair for remaining streams ─────

  const rustFailing = findAllStreams(pdfStr)
    .filter(s => s.hasFilter && !canInflate(pdf.subarray(s.start, s.end)));

  if (rustFailing.length > 0) {
    // Optional native helper, resolved relative to this script's location.
    const rustBin = new URL("stream_fixer/target/release/stream_fixer", import.meta.url).pathname;
    const hasTool = existsSync(rustBin);

    if (hasTool) {
      console.log(`\n  Rust-accelerated repair for ${rustFailing.length} remaining streams...\n`);

      // Write current b64 state to a temp file
      // (protocol: the tool reads the b64 file and reports changes as
      // JSON on stdout; we apply them here and rewrite the file so each
      // stream's run sees the previous stream's fixes).
      base64Buf = Buffer.from(pdf.toString('base64'));
      const tmpB64 = `/tmp/pdf_recover_b64_${process.pid}.txt`;
      writeFileSync(tmpB64, base64Buf);

      for (const stream of rustFailing) {
        const currentStream = findObjStream(pdfStr, stream.obj) ||
                              findAllStreams(pdfStr).find(s => s.obj === stream.obj);
        if (!currentStream) continue;

        process.stdout.write(`  Obj ${String(stream.obj).padStart(2)} (${currentStream.size}b): `);

        // 5-minute timeout per stream; search knobs are wider than the
        // in-process DFS since the native tool is much faster.
        const result = spawnSync(rustBin, [
          tmpB64, String(currentStream.start), String(currentStream.end),
          "--max-iter", "30", "--max-nodes", "100000",
          "--window", "600", "--branch", "3", "--max-depth", "20"
        ], { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"], timeout: 300000 });

        if (result.status !== 0) {
          console.log(`error (${(result.stderr || "").substring(0, 60)})`);
          continue;
        }

        try {
          const output = JSON.parse(result.stdout);
          if (output.fixed && output.changes && output.changes.length > 0) {
            // Apply the b64 changes
            // (each change is a single-character substitution at b64_pos).
            for (const ch of output.changes) {
              const pos = ch.b64_pos;
              base64Buf[pos] = ch.new_char.charCodeAt(0);
            }
            // Re-decode PDF and update
            pdf = Buffer.from(base64Buf.toString(), 'base64');
            pdfStr = pdf.toString('binary');
            // Write updated b64 for next stream
            writeFileSync(tmpB64, base64Buf);

            // Checksum repair on this stream
            // (the tool may fix the data while the Adler-32 stays stale).
            const fixed = findObjStream(pdfStr, stream.obj) ||
                          findAllStreams(pdfStr).find(s => s.obj === stream.obj);
            if (fixed && fixed.hasFilter && !canInflate(pdf.subarray(fixed.start, fixed.end))) {
              if (tryFixChecksum(pdf, fixed)) {
                pdfStr = pdf.toString('binary');
                base64Buf = Buffer.from(pdf.toString('base64'));
                writeFileSync(tmpB64, base64Buf);
              }
            }

            const nowOk = fixed && canInflate(pdf.subarray(fixed!.start, fixed!.end));
            totalFixed += nowOk ? 1 : 0;
            console.log(`${nowOk ? 'FIXED' : 'partial'} [Rust] ${output.desc}`);
          } else if (output.fixed) {
            console.log(`already OK [Rust] ${output.desc}`);
          } else {
            console.log(`not fixed [Rust] ${output.desc}`);
          }
        } catch (e) {
          // Non-JSON stdout from the tool — report and move on.
          console.log(`parse error: ${result.stdout.substring(0, 60)}`);
        }
      }

      // Clean up temp file
      try { unlinkSync(tmpB64); } catch {}
    } else {
      console.log(`\n  (Rust tool not found at ${rustBin} — skipping accelerated repair)`);
      console.log(`  Build with: cd stream_fixer && cargo build --release`);
    }
  }

  // ─── Step 5e: Stream-end (BFINAL) repair for remaining streams ──

  // tryFixStreamEnd patches pdf in place; note base64Buf is NOT re-synced
  // here — it is not read again after this point.
  const bfinalFailing = findAllStreams(pdfStr)
    .filter(s => s.hasFilter && !canInflate(pdf.subarray(s.start, s.end)));

  if (bfinalFailing.length > 0) {
    console.log(`\n  Stream-end repair for ${bfinalFailing.length} remaining streams...\n`);
    for (const stream of bfinalFailing) {
      process.stdout.write(`  Obj ${String(stream.obj).padStart(2)} (${stream.size}b): `);
      if (tryFixStreamEnd(pdf, stream)) {
        totalFixed++;
        pdfStr = pdf.toString('binary');
        console.log('FIXED [BFINAL]');
      } else {
        console.log('not fixed');
      }
    }
  }

  // ─── Step 5f: Visual repair (ObjStm alignment, color, content) ──

  // Cosmetic/structural passes; each returns the updated pdfStr view.
  console.log('\n  Visual repair...');
  pdfStr = fixObjStmAlignment(pdf, pdfStr);
  pdfStr = truncateGarbledContentStreams(pdf, pdfStr);
  pdfStr = replaceColorSpacesWithCMYK(pdf, pdfStr);
  console.log('  done.');

  // ─── Step 6: Final output ───────────────────────────────────────

  console.log("\n" + "=".repeat(64));
  console.log("  RESULTS");
  console.log("=".repeat(64));

  // Per-object detection for accurate results
  // (re-locate each originally-compressed object in the final pdfStr).
  let finalOk = 0;
  let finalTotal = 0;
  const stillFailingList: { obj: number; size: number; err: string }[] = [];

  for (const origStream of flatStreams) {
    const s = findObjStream(pdfStr, origStream.obj) ||
              findAllStreams(pdfStr).find(x => x.obj === origStream.obj);
    if (!s || !s.hasFilter) continue;
    finalTotal++;
    const data = pdf.subarray(s.start, s.end);
    if (canInflate(data)) {
      finalOk++;
    } else {
      stillFailingList.push({ obj: s.obj, size: s.size, err: getError(data) });
    }
  }

  console.log(`\nCompressed streams: ${finalOk}/${finalTotal} OK`);
  console.log(`Total streams fixed: ${totalFixed}`);

  if (pageContentObjs.length > 0) {
    let pageOk = 0;
    console.log("\nPage content streams:");
    for (const objNum of pageContentObjs) {
      const s = findObjStream(pdfStr, objNum);
      if (s && s.hasFilter) {
        const ok = canInflate(pdf.subarray(s.start, s.end));
        if (ok) pageOk++;
        console.log(`  Object ${objNum}: ${ok ? 'OK' : 'FAIL'}`);
      }
    }
    console.log(`Page content: ${pageOk}/${pageContentObjs.length} OK`);
  }

  if (stillFailingList.length > 0) {
    console.log(`\nStill failing (${stillFailingList.length}):`);
    for (const s of stillFailingList) {
      console.log(`  Obj ${s.obj}: ${s.size}b - ${s.err.substring(0, 50)}`);
    }
  }

  // Save raw repaired PDF
  const rawFile = outputFile.replace(/\.pdf$/, "-raw.pdf");
  writeFileSync(rawFile, pdf);
  console.log(`\nSaved raw: ${rawFile}`);

  // Extract text and re-distill
  // (Ghostscript rebuilds the xref/structure; keep whichever version
  // yields more extracted text).
  console.log("\n[6/6] Extracting text and re-distilling...");
  let rawText = extractText(rawFile);
  let rawLines = rawText.split('\n').filter((l: string) => l.trim());
  console.log(`  Raw PDF text: ${rawLines.length} non-empty lines`);

  if (runGhostscript(rawFile, outputFile)) {
    console.log(`  Saved: ${outputFile}`);
    const distilledText = extractText(outputFile);
    const distilledLines = distilledText.split('\n').filter((l: string) => l.trim());
    console.log(`  Re-distilled text: ${distilledLines.length} non-empty lines`);

    // Line count is the (crude) quality metric for choosing which text to show.
    const bestText = distilledLines.length >= rawLines.length ? distilledText : rawText;
    const bestLines = bestText.split('\n').filter((l: string) => l.trim());
    const bestSource = distilledLines.length >= rawLines.length ? 'distilled' : 'raw';

    console.log(`\n--- Extracted Text (${bestSource}, first 40 lines) ---`);
    if (bestLines.length > 0) {
      console.log(bestLines.slice(0, 40).join('\n'));
      console.log(`\n(${bestLines.length} total non-empty lines)`);
    } else {
      console.log("(no text extracted)");
    }
  } else {
    // Ghostscript unavailable or failed — the raw repaired bytes are
    // still the best output we have.
    console.log(`  Ghostscript re-distill failed, using raw PDF`);
    writeFileSync(outputFile, pdf);
  }

  // Keep raw file for debugging
  // if (existsSync(rawFile) && existsSync(outputFile) && rawFile !== outputFile) {
  //   unlinkSync(rawFile);
  // }
}
  1774.  
  1775. main().catch(console.error);
  1776.  
Add Comment
Please, Sign In to add comment