- #!/usr/bin/env bun
- /**
- * PDF Recovery Script
- *
- * Recovers a corrupted base64-encoded PDF that was OCR'd with errors.
- * Techniques: text substitutions, Adler-32 checksum repair,
- * error-position-guided fuzzing with OCR confusion pairs,
- * brute-force l→1 substitution.
- *
- * Memory-efficient: modifies stream data bytes in-place instead of
- * re-decoding the entire PDF for each trial.
- *
- * Usage: bun recover_pdf.ts <input.txt> <output.pdf>
- */
- import { readFileSync, writeFileSync, existsSync, unlinkSync } from "fs";
- import { inflateSync, inflateRawSync, deflateSync, constants } from "zlib";
- import { spawnSync } from "child_process";
- // ── Utilities ───────────────────────────────────────────────────────
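- // Note: `which` assumes a Unix-like environment (macOS/Linux, matching the
- // brew hint below); on Windows the equivalent lookup would be `where`.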
- function commandExists(cmd: string): boolean {
- return spawnSync("which", [cmd], { encoding: "utf-8" }).status === 0;
- }
- function runGhostscript(inputPath: string, outputPath: string): boolean {
- if (!commandExists("gs")) {
- console.error("\nGhostscript (gs) not found. Install with: brew install ghostscript");
- return false;
- }
- if (existsSync(outputPath)) unlinkSync(outputPath);
- spawnSync("gs", [
- "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4",
- "-dNOPAUSE", "-dQUIET", "-dBATCH",
- `-sOutputFile=${outputPath}`, inputPath
- ], { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"] });
- return existsSync(outputPath);
- }
- function extractText(pdfPath: string): string {
- if (!commandExists("gs")) return "(gs not available)";
- const result = spawnSync("gs", [
- "-sDEVICE=txtwrite", "-sOutputFile=-",
- "-dQUIET", "-dNOPAUSE", "-dBATCH", pdfPath
- ], { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"] });
- return result.stdout || "";
- }
- // ── PDF Token Dictionary ─────────────────────────────────────────────
- // Known valid tokens in PDF files for auto-discovering OCR corruption.
- const PDF_TOKENS: Set<string> = new Set([
- // Dictionary keys
- "/Type", "/Subtype", "/Filter", "/Length", "/Width", "/Height",
- "/Name", "/Names", "/Pages", "/Page", "/Root", "/Info", "/Size",
- "/Resources", "/Contents", "/MediaBox", "/CropBox", "/BleedBox",
- "/TrimBox", "/ArtBox", "/Font", "/XObject", "/ExtGState",
- "/ColorSpace", "/ProcSet", "/Encoding", "/BaseFont", "/Metadata",
- "/Predictor", "/Columns", "/Colors", "/DecodeParms", "/Decode",
- "/Linearized", "/Parent", "/Kids", "/Count", "/Rotate",
- "/Annots", "/Border", "/Rect", "/Dest", "/Action", "/URI", "/S",
- "/FontDescriptor", "/FontName", "/FontFile", "/FontFile2",
- "/FontFile3", "/Flags", "/ItalicAngle", "/StemV", "/StemH",
- "/Ascent", "/Descent", "/CapHeight", "/XHeight", "/MissingWidth",
- "/FontBBox", "/FirstChar", "/LastChar", "/Widths", "/AvgWidth",
- "/MaxWidth", "/Leading", "/ToUnicode", "/CIDSystemInfo",
- "/DescendantFonts", "/BaseEncoding", "/Differences", "/DW", "/W",
- "/BitsPerComponent", "/ImageMask", "/Mask", "/SMask",
- "/Interpolate", "/Intent", "/ColorTransform",
- "/ID", "/Index", "/Prev", "/N", "/O", "/P", "/E", "/T", "/H", "/L",
- "/Producer", "/Creator", "/CreationDate", "/ModDate", "/LastModified",
- "/Title", "/Author", "/Subject", "/Keywords", "/Trapped",
- "/GTS_PDFXVersion", "/OutputIntents", "/DestOutputProfile",
- "/MarkInfo", "/StructTreeRoot", "/Lang",
- "/ViewerPreferences", "/PageLayout", "/PageMode",
- "/Outlines", "/Threads", "/OpenAction", "/AcroForm", "/Fields",
- "/Prop_Build", "/Properties", "/Group", "/K", "/Pg",
- "/BBox", "/Matrix", "/FormType", "/OC", "/OCGs", "/OCProperties",
- "/Registry", "/Ordering", "/Supplement",
- // Type values
- "/Catalog", "/FontDescriptor", "/Image", "/Form",
- "/Annot", "/Link", "/Text", "/Widget", "/ObjStm", "/XRef",
- // Filter values
- "/FlateDecode", "/DCTDecode", "/ASCII85Decode", "/ASCIIHexDecode",
- "/LZWDecode", "/RunLengthDecode", "/CCITTFaxDecode",
- "/JBIG2Decode", "/JPXDecode", "/Crypt",
- // Color spaces
- "/DeviceRGB", "/DeviceCMYK", "/DeviceGray", "/DeviceN",
- "/ICCBased", "/Indexed", "/CalRGB", "/CalGray", "/Lab",
- "/Pattern", "/Separation",
- // ProcSet values
- "/PDF", "/ImageB", "/ImageC", "/ImageI",
- // XMP namespaces (bare, no leading /)
- "xmpG", "xmpMM", "xmpTPg", "stRef", "stFnt", "stEvt",
- "pdfx", "pdfaid",
- // Common parameter values (bare)
- "FlateDecode", "DCTDecode", "CMYK", "Name", "ProcSet",
- "Predictor", "mode", "LastModified",
- ]);
- // ── OCR Confusion Model ─────────────────────────────────────────────
- // Maps characters to what OCR commonly misreads them as (bidirectional).
- const OCR_CONFUSIONS: Map<string, string[]> = new Map([
- [".", ["/"]],
- ["/", ["."]],
- ["l", ["1", "I"]],
- ["1", ["l", "I"]],
- ["I", ["l", "1", "M"]],
- ["O", ["0"]],
- ["0", ["O"]],
- ["m", ["i"]],
- ["i", ["m"]],
- ["M", ["I"]],
- ["c", ["b"]],
- ["b", ["c"]],
- ["5", ["S", "%"]],
- ["S", ["5"]],
- ["%", ["5"]],
- ["8", ["B"]],
- ["B", ["8"]],
- ["e", ["u"]],
- ["u", ["e"]],
- ]);
- // ── Auto-discovery of OCR errors ────────────────────────────────────
- interface DiscoveredFix {
- offset: number;
- original: string;
- replacement: string;
- rule: string;
- }
- /** Find plaintext regions (everything outside stream...endstream) */
- function findPlaintextRegions(pdfStr: string): { start: number; end: number }[] {
- const regions: { start: number; end: number }[] = [];
- const streamRe = /stream[\r\n][\s\S]*?endstream/g;
- let lastEnd = 0;
- let m;
- while ((m = streamRe.exec(pdfStr)) !== null) {
- if (m.index > lastEnd) regions.push({ start: lastEnd, end: m.index });
- lastEnd = m.index + m[0].length;
- }
- if (lastEnd < pdfStr.length) regions.push({ start: lastEnd, end: pdfStr.length });
- return regions;
- }
- /** Try single-char OCR confusions to match a token against the dictionary */
- function tryOCRConfusions(token: string): string | null {
- // 1-char substitutions
- for (let i = 0; i < token.length; i++) {
- const confusions = OCR_CONFUSIONS.get(token[i]);
- if (!confusions) continue;
- for (const sub of confusions) {
- if (sub.length !== 1) continue;
- const candidate = token.substring(0, i) + sub + token.substring(i + 1);
- if (PDF_TOKENS.has(candidate)) return candidate;
- }
- }
- // 2-char substitutions for longer tokens
- if (token.length > 4) {
- for (let i = 0; i < token.length; i++) {
- const c1 = OCR_CONFUSIONS.get(token[i]);
- if (!c1) continue;
- for (const s1 of c1) {
- if (s1.length !== 1) continue;
- const partial = token.substring(0, i) + s1 + token.substring(i + 1);
- for (let j = i + 1; j < partial.length; j++) {
- const c2 = OCR_CONFUSIONS.get(partial[j]);
- if (!c2) continue;
- for (const s2 of c2) {
- if (s2.length !== 1) continue;
- const candidate = partial.substring(0, j) + s2 + partial.substring(j + 1);
- if (PDF_TOKENS.has(candidate)) return candidate;
- }
- }
- }
- }
- }
- return null;
- }
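- // Worked example: tryOCRConfusions("/F1ateDecode") substitutes '1' -> 'l'
- // at index 2, producing "/FlateDecode", which is in PDF_TOKENS.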
- /**
- * Auto-discover and fix OCR errors in the decoded PDF using PDF spec knowledge.
- * Replaces hardcoded BASE64_SUBSTITUTIONS and DECODED_FIXES.
- */
- function autoDiscoverAndFixOCRErrors(base64Clean: string): {
- base64Buf: Buffer; pdf: Buffer; pdfStr: string; fixCount: number;
- } {
- let pdf = Buffer.from(base64Clean, 'base64');
- let pdfStr = pdf.toString('binary');
- const regions = findPlaintextRegions(pdfStr);
- const fixes: DiscoveredFix[] = [];
- const seen = new Set<number>();
- for (const region of regions) {
- const text = pdfStr.substring(region.start, region.end);
- // Pattern 1: PDF names — /Word or .Word (the dot may be a corrupted slash)
- const nameRe = /[./][A-Z][A-Za-z0-9_]*/g;
- let m;
- while ((m = nameRe.exec(text)) !== null) {
- const token = m[0];
- const absOff = region.start + m.index;
- if (seen.has(absOff)) continue;
- if (token.startsWith(".")) {
- // Try . → / directly
- const slashed = "/" + token.substring(1);
- if (PDF_TOKENS.has(slashed)) {
- fixes.push({ offset: absOff, original: token, replacement: slashed,
- rule: `. -> / (${slashed})` });
- seen.add(absOff);
- continue;
- }
- // Try . → / plus further OCR fix
- const further = tryOCRConfusions(slashed);
- if (further) {
- fixes.push({ offset: absOff, original: token, replacement: further,
- rule: `. -> / + OCR fix (${further})` });
- seen.add(absOff);
- continue;
- }
- }
- if (token.startsWith("/") && !PDF_TOKENS.has(token)) {
- const fixed = tryOCRConfusions(token);
- if (fixed) {
- fixes.push({ offset: absOff, original: token, replacement: fixed,
- rule: `OCR fix (${fixed})` });
- seen.add(absOff);
- }
- }
- }
- // Pattern 2: Bare words (XMP namespaces, parameter values)
- const wordRe = /(?<![/.\w])[a-zA-Z][a-zA-Z]{2,20}(?![a-zA-Z])/g;
- while ((m = wordRe.exec(text)) !== null) {
- const token = m[0];
- const absOff = region.start + m.index;
- if (seen.has(absOff) || PDF_TOKENS.has(token)) continue;
- const fixed = tryOCRConfusions(token);
- if (fixed) {
- fixes.push({ offset: absOff, original: token, replacement: fixed,
- rule: `bare word OCR fix (${fixed})` });
- seen.add(absOff);
- }
- }
- // Pattern 3: Structural . → / after >> or ] or digits
- const structRe = /(>>|]|\d)\.([\s\r\n/A-Z])/g;
- while ((m = structRe.exec(text)) !== null) {
- const dotOff = region.start + m.index + m[1].length;
- if (seen.has(dotOff)) continue;
- fixes.push({ offset: dotOff, original: ".", replacement: "/",
- rule: `. -> / after "${m[1]}"` });
- seen.add(dotOff);
- }
- // Pattern 4: % in hex context (likely misread 5)
- const hexRe = /%([0-9A-Fa-f]{4,})/g;
- while ((m = hexRe.exec(text)) !== null) {
- const absOff = region.start + m.index;
- if (seen.has(absOff)) continue;
- fixes.push({ offset: absOff, original: "%", replacement: "5",
- rule: `% -> 5 in hex context` });
- seen.add(absOff);
- }
- }
- // Apply fixes (reverse offset order to preserve positions)
- fixes.sort((a, b) => b.offset - a.offset);
- for (const fix of fixes) {
- pdfStr = pdfStr.substring(0, fix.offset) + fix.replacement +
- pdfStr.substring(fix.offset + fix.original.length);
- }
- // Log discoveries (sorted by offset for readability)
- const sorted = [...fixes].sort((a, b) => a.offset - b.offset);
- for (const fix of sorted) {
- console.log(` "${fix.original}" -> "${fix.replacement}" [${fix.rule}]`);
- }
- if (fixes.length > 0) {
- pdf = Buffer.from(pdfStr, 'binary');
- }
- const base64Buf = Buffer.from(pdf.toString('base64'));
- return { base64Buf, pdf, pdfStr, fixCount: fixes.length };
- }
- // ── Base64 helpers ──────────────────────────────────────────────────
- const B64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
- const B64_VAL = new Uint8Array(128);
- for (let i = 0; i < 64; i++) B64_VAL[B64_CHARS.charCodeAt(i)] = i;
- const L_BYTE = 'l'.charCodeAt(0);
- const ONE_BYTE = '1'.charCodeAt(0);
- function pdfToB64(pdfPos: number): number {
- return Math.floor(pdfPos / 3) * 4;
- }
- /** Decode one base64 group (4 chars → 3 bytes) */
- function decodeGroup(b64Buf: Buffer, groupStart: number): [number, number, number] {
- const a = B64_VAL[b64Buf[groupStart]];
- const b = B64_VAL[b64Buf[groupStart + 1]];
- const c = B64_VAL[b64Buf[groupStart + 2]];
- const d = B64_VAL[b64Buf[groupStart + 3]];
- return [(a << 2) | (b >> 4), ((b & 0xF) << 4) | (c >> 2), ((c & 3) << 6) | d];
- }
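- // Worked example (standard base64): "TWFu" decodes to "Man".
- //   T=19, W=22, F=5, u=46 -> bytes 77 'M', 97 'a', 110 'n'.
- // pdfToB64 maps decoded 3-byte groups back to 4-char base64 groups,
- // e.g. PDF byte 7 is in group floor(7/3)=2, whose base64 chars start at 8.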
- /**
- * Apply a base64 char change to a stream data buffer IN PLACE.
- * Returns the old bytes so the change can be reverted.
- * Only modifies bytes that fall within the stream data range.
- */
- function applyB64Change(
- b64Buf: Buffer, streamData: Buffer, streamStart: number,
- b64Pos: number, newChar: number
- ): { oldBytes: [number, number, number]; changed: boolean } {
- const orig = b64Buf[b64Pos];
- const groupStart = Math.floor(b64Pos / 4) * 4;
- const pdfByteStart = (groupStart / 4) * 3;
- // Get old decoded bytes for this group
- const old = decodeGroup(b64Buf, groupStart); // fresh tuple per call, no copy needed
- // Temporarily apply change and decode
- b64Buf[b64Pos] = newChar;
- const [n0, n1, n2] = decodeGroup(b64Buf, groupStart);
- b64Buf[b64Pos] = orig;
- let changed = false;
- const offsets = [pdfByteStart - streamStart, pdfByteStart - streamStart + 1, pdfByteStart - streamStart + 2];
- const newBytes = [n0, n1, n2];
- for (let k = 0; k < 3; k++) {
- if (offsets[k] >= 0 && offsets[k] < streamData.length && newBytes[k] !== streamData[offsets[k]]) {
- streamData[offsets[k]] = newBytes[k];
- changed = true;
- }
- }
- // Return the old values for the 3 positions (for reverting)
- return { oldBytes: old, changed };
- }
- /** Revert a base64 change on stream data */
- function revertB64Change(
- streamData: Buffer, streamStart: number,
- b64Pos: number, oldBytes: [number, number, number]
- ): void {
- const groupStart = Math.floor(b64Pos / 4) * 4;
- const pdfByteStart = (groupStart / 4) * 3;
- for (let k = 0; k < 3; k++) {
- const off = pdfByteStart - streamStart + k;
- if (off >= 0 && off < streamData.length) {
- streamData[off] = oldBytes[k];
- }
- }
- }
- // ── OCR confusion pairs ─────────────────────────────────────────────
- const CONFUSION_PAIRS: [number, number][] = [
- [0x6C, 0x31], [0x31, 0x6C], // l ↔ 1
- [0x4F, 0x30], [0x30, 0x4F], // O ↔ 0
- [0x49, 0x6C], [0x6C, 0x49], // I ↔ l
- [0x49, 0x31], [0x31, 0x49], // I ↔ 1
- [0x35, 0x53], [0x53, 0x35], // 5 ↔ S
- [0x38, 0x42], [0x42, 0x38], // 8 ↔ B
- ];
- const SUBS_MAP = new Map<number, number[]>();
- for (const [from, to] of CONFUSION_PAIRS) {
- if (!SUBS_MAP.has(from)) SUBS_MAP.set(from, []);
- const arr = SUBS_MAP.get(from)!;
- if (!arr.includes(to)) arr.push(to);
- }
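- // e.g. SUBS_MAP.get(0x6C) is [0x31, 0x49]: 'l' may be tried as '1' or 'I'.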
- // ── Types ───────────────────────────────────────────────────────────
- interface StreamInfo {
- obj: number;
- start: number;
- end: number;
- size: number;
- hasFilter: boolean;
- }
- // ── Stream detection ────────────────────────────────────────────────
- function findAllStreams(pdfStr: string): StreamInfo[] {
- const streams: StreamInfo[] = [];
- const re = />>\s*stream[\r\n]+/g;
- let m;
- while ((m = re.exec(pdfStr)) !== null) {
- const dataStart = m.index + m[0].length;
- const lookback = pdfStr.substring(Math.max(0, m.index - 2000), m.index);
- const lastObj = lookback.lastIndexOf(' 0 obj');
- if (lastObj === -1) continue;
- const lastEndobj = lookback.lastIndexOf('endobj');
- if (lastEndobj > lastObj) continue;
- const beforeObj = lookback.substring(Math.max(0, lastObj - 10), lastObj);
- const numMatch = beforeObj.match(/(\d+)\s*$/);
- if (!numMatch) continue;
- const objNum = parseInt(numMatch[1]);
- const dictText = lookback.substring(lastObj);
- const end = pdfStr.indexOf('endstream', dataStart);
- if (end === -1) continue;
- const hasFilter = /FlateDecode/.test(dictText);
- streams.push({ obj: objNum, start: dataStart, end, size: end - dataStart, hasFilter });
- }
- return streams;
- }
- function findObjStream(pdfStr: string, objNum: number): StreamInfo | null {
- const pat = new RegExp('(?:^|[\\r\\n])' + objNum + '\\s+0\\s+obj', 'g');
- let om;
- while ((om = pat.exec(pdfStr)) !== null) {
- const objStart = om.index;
- const after = pdfStr.substring(objStart, objStart + 50000);
- const sm = after.match(/>>\s*stream[\r\n]+/);
- if (!sm) continue;
- const between = after.substring(0, sm.index! + sm[0].length);
- if (/endobj/.test(between)) continue;
- const start = objStart + sm.index! + sm[0].length;
- const end = pdfStr.indexOf('endstream', start);
- if (end === -1) continue;
- const hasFilter = /FlateDecode/.test(between);
- return { obj: objNum, start, end, size: end - start, hasFilter };
- }
- return null;
- }
- function findPageContentObjects(pdfStr: string): number[] {
- const contents: number[] = [];
- const re = /\/Type\s*\/Page\b(?!s)([\s\S]*?)(?=endobj)/g;
- let pm;
- while ((pm = re.exec(pdfStr)) !== null) {
- const chunk = pm[0]; // pm[1] is the tail of pm[0]; concatenating would duplicate it
- const single = chunk.match(/\/Contents\s+(\d+)\s+0\s+R/);
- if (single) contents.push(parseInt(single[1]));
- const arr = chunk.match(/\/Contents\s*\[([\d\s\nR]+)\]/);
- if (arr) {
- for (const ref of arr[1].matchAll(/(\d+)\s+0\s+R/g)) {
- contents.push(parseInt(ref[1]));
- }
- }
- }
- return [...new Set(contents)];
- }
- // ── Decompression helpers ───────────────────────────────────────────
- function canInflate(data: Buffer): boolean {
- try { inflateSync(data); return true; } catch { return false; }
- }
- function getError(data: Buffer): string {
- try { inflateSync(data); return "OK"; }
- catch (e: any) { return String(e.message || e).replace(/^Error.*?:\s*/, ''); }
- }
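- // Binary-search for the approximate byte offset where strict raw inflation
- // first hits a hard error. Prefixes that merely end early count as "still OK"
- // (Z_SYNC_FLUSH tolerates truncation), so `lo` converges on the longest
- // prefix that decodes cleanly; the +2 converts back to stream-data
- // coordinates, past the 2-byte zlib header.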
- function findCorruptionOffset(streamData: Buffer): number {
- if (streamData.length < 3) return 0;
- const rawData = streamData.subarray(2);
- let lo = 0, hi = rawData.length;
- while (lo < hi - 1) {
- const mid = Math.floor((lo + hi) / 2);
- try {
- inflateRawSync(rawData.subarray(0, mid), { finishFlush: constants.Z_SYNC_FLUSH });
- lo = mid;
- } catch (e: any) {
- const msg = String(e.message || '');
- if (msg.includes('unexpected end') || msg.includes('buffer error') ||
- msg.includes('incomplete') || msg.includes('need dictionary')) {
- lo = mid;
- } else { hi = mid; }
- }
- }
- return lo + 2;
- }
- // ── Adler-32 checksum repair ────────────────────────────────────────
- function computeAdler32(data: Buffer): number {
- let a = 1, b = 0;
- const MOD = 65521;
- for (let i = 0; i < data.length; i++) {
- a = (a + data[i]) % MOD;
- b = (b + a) % MOD;
- }
- return ((b << 16) | a) >>> 0;
- }
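- // Worked example: computeAdler32(Buffer.from("abc")) === 0x024D0127
- //   (a: 1->98->196->295; b: 98->294->589; (589 << 16) | 295).
- // zlib stream layout (RFC 1950): [2-byte header][deflate data][4-byte
- // big-endian Adler-32 of the *uncompressed* data]; this is why the repair
- // functions below skip 2 bytes and patch the trailing 4.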
- function tryFixChecksum(pdf: Buffer, stream: StreamInfo): boolean {
- if (!stream.hasFilter || stream.size < 6) return false;
- // Strategy 1: Patch near the end (works when zlib data fills the stream)
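- // Trimming 2/1/0 bytes tolerates a possible trailing \r\n before 'endstream'.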
- for (const trim of [2, 1, 0]) {
- const actualEnd = stream.end - trim;
- const streamData = pdf.subarray(stream.start, actualEnd);
- if (streamData.length < 6) continue;
- let decompressed: Buffer;
- try { decompressed = inflateRawSync(streamData.subarray(2)); } catch { continue; }
- const adler = computeAdler32(decompressed);
- const pos = actualEnd - 4;
- const old = [pdf[pos], pdf[pos+1], pdf[pos+2], pdf[pos+3]];
- pdf[pos] = (adler >>> 24) & 0xFF;
- pdf[pos + 1] = (adler >>> 16) & 0xFF;
- pdf[pos + 2] = (adler >>> 8) & 0xFF;
- pdf[pos + 3] = adler & 0xFF;
- if (canInflate(pdf.subarray(stream.start, stream.end))) return true;
- // Revert
- pdf[pos] = old[0]; pdf[pos+1] = old[1]; pdf[pos+2] = old[2]; pdf[pos+3] = old[3];
- }
- // Strategy 2: Find actual deflate data end (handles trailing data after checksum)
- const raw = pdf.subarray(stream.start + 2, stream.end);
- let decompressed: Buffer;
- try { decompressed = inflateRawSync(raw); } catch { return false; }
- // Binary search for minimum raw bytes needed (= deflate data length)
- let lo = 1, hi = raw.length;
- while (lo + 1 < hi) {
- const mid = Math.floor((lo + hi) / 2);
- try { inflateRawSync(raw.subarray(0, mid)); hi = mid; } catch { lo = mid; }
- }
- const deflateLen = hi;
- const adler = computeAdler32(decompressed);
- const checksumPos = stream.start + 2 + deflateLen;
- if (checksumPos + 4 > stream.end) return false;
- const old = [pdf[checksumPos], pdf[checksumPos+1], pdf[checksumPos+2], pdf[checksumPos+3]];
- pdf[checksumPos] = (adler >>> 24) & 0xFF;
- pdf[checksumPos + 1] = (adler >>> 16) & 0xFF;
- pdf[checksumPos + 2] = (adler >>> 8) & 0xFF;
- pdf[checksumPos + 3] = adler & 0xFF;
- if (canInflate(pdf.subarray(stream.start, stream.end))) return true;
- // Revert
- pdf[checksumPos] = old[0]; pdf[checksumPos+1] = old[1]; pdf[checksumPos+2] = old[2]; pdf[checksumPos+3] = old[3];
- return false;
- }
- /**
- * Fix streams where BFINAL bit is missing/corrupted.
- * Detects via: strict inflateRawSync fails with "unexpected end of file"
- * but Z_SYNC_FLUSH mode succeeds. Then brute-forces 1-2 byte changes
- * near the stream end to restore the BFINAL marker, followed by
- * Adler-32 checksum repair.
- */
- function tryFixStreamEnd(pdf: Buffer, stream: StreamInfo): boolean {
- if (!stream.hasFilter || stream.size < 6) return false;
- const rawOrig = pdf.subarray(stream.start + 2, stream.end);
- // Only attempt if strict inflate fails with "unexpected end"
- try { inflateRawSync(rawOrig); return false; } catch (e: any) {
- if (!String(e.message || '').includes('unexpected end')) return false;
- }
- // Verify Z_SYNC_FLUSH works (data valid but BFINAL missing)
- try { inflateRawSync(rawOrig, { finishFlush: constants.Z_SYNC_FLUSH }); }
- catch { return false; }
- const raw = Buffer.from(rawOrig); // work on a copy
- const searchRange = Math.min(50, raw.length);
- const startIdx = Math.max(0, raw.length - searchRange);
- for (let i = startIdx; i < raw.length; i++) {
- const oi = raw[i];
- for (let v = 0; v < 256; v++) {
- if (v === oi) continue;
- raw[i] = v;
- let ok = false;
- try { inflateRawSync(raw); ok = true; } catch (e: any) {
- if (!String(e.message || '').includes('unexpected end')) {
- // Error shifted — search ±5 for second byte fix
- for (let j = Math.max(0, i - 5); j <= Math.min(raw.length - 1, i + 5); j++) {
- if (j === i) continue;
- const oj = raw[j];
- for (let w = 0; w < 256; w++) {
- if (w === oj) continue;
- raw[j] = w;
- try { inflateRawSync(raw); ok = true; } catch {}
- if (ok) {
- // Apply both changes to pdf, then fix checksum
- pdf[stream.start + 2 + i] = v;
- pdf[stream.start + 2 + j] = w;
- if (tryFixChecksum(pdf, stream)) return true;
- // Revert if checksum repair failed
- pdf[stream.start + 2 + i] = oi;
- pdf[stream.start + 2 + j] = oj;
- ok = false;
- }
- }
- raw[j] = oj;
- }
- }
- }
- if (ok) {
- // Single-byte fix succeeded
- pdf[stream.start + 2 + i] = v;
- if (tryFixChecksum(pdf, stream)) return true;
- pdf[stream.start + 2 + i] = oi; // revert
- }
- raw[i] = oi;
- }
- }
- return false;
- }
- // ── Visual repair: ObjStm alignment, color space, content truncation ─
- /**
- * Fix ObjStm /First header alignment. If the header byte count doesn't
- * match the /First value in the dictionary, pad with spaces so GS can
- * find embedded objects.
- */
- function fixObjStmAlignment(pdf: Buffer, pdfStr: string): string {
- const re = />>\s*stream[\r\n]+/g;
- let m;
- while ((m = re.exec(pdfStr)) !== null) {
- const start = m.index + m[0].length;
- const lookback = pdfStr.substring(Math.max(0, m.index - 500), m.index);
- const lastObj = lookback.lastIndexOf(' 0 obj');
- if (lastObj === -1) continue;
- const dictText = lookback.substring(lastObj + 6);
- if (!/\/ObjStm/.test(dictText)) continue;
- const end = pdfStr.indexOf('endstream', start);
- if (end === -1) continue;
- const data = pdf.subarray(start, end);
- let dec: Buffer;
- try { dec = inflateSync(data); } catch { continue; }
- const text = dec.toString('utf-8');
- const firstMatch = dictText.match(/\/First\s+(\d+)/);
- if (!firstMatch) continue;
- const firstValue = parseInt(firstMatch[1]);
- const headerMatch = text.match(/^([\d\s]+)/);
- if (!headerMatch) continue;
- const currentLen = headerMatch[0].length;
- if (currentLen === firstValue) continue;
- // Pad header to match /First
- const nums = headerMatch[1].trim().split(/\s+/).map(Number);
- let newHeader = '';
- for (let i = 0; i < nums.length; i += 2) {
- newHeader += nums[i] + ' ' + nums[i + 1] + ' ';
- }
- while (newHeader.length < firstValue) newHeader += ' ';
- const objData = text.substring(currentLen);
- const newContent = newHeader + objData;
- const compressed = deflateSync(Buffer.from(newContent));
- if (compressed.length > end - start) continue;
- compressed.copy(pdf, start);
- for (let i = start + compressed.length; i < end; i++) pdf[i] = 0;
- updateStreamLength(pdf, pdfStr, start, compressed.length);
- pdfStr = pdf.toString('binary');
- }
- return pdfStr;
- }
- /**
- * Auto-detect Separation color space → CMYK mappings from the PDF's
- * color space definition objects, then replace /CSn cs T scn operators
- * in content streams with direct CMYK k operators.
- *
- * For FunctionType 2 (exponential interpolation):
- * CMYK = C0 + tint^N * (C1 - C0)
- */
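- // Worked example (values match the csMap example below): a Separation space
- // with C0=[0 0 0 0], C1=[1 0.57 0 0.38] at tint=1 yields CMYK
- // 1 0.57 0 0.38, so "/CS0 cs 1 scn" is rewritten to "1 0.57 0 0.38 k".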
- function replaceColorSpacesWithCMYK(pdf: Buffer, pdfStr: string): string {
- // Step 1: Find /ColorSpace<<...>> in page Resource dictionaries
- // Maps CS name (e.g. "CS0") → object number
- const csRefs = new Map<string, number>();
- const csRefRe = /\/ColorSpace\s*<<([^>]+)>>/g;
- let csm;
- while ((csm = csRefRe.exec(pdfStr)) !== null) {
- const entries = csm[1].matchAll(/\/(CS\d+)\s+(\d+)\s+0\s+R/g);
- for (const e of entries) csRefs.set(e[1], parseInt(e[2]));
- }
- if (csRefs.size === 0) return pdfStr;
- // Step 2: Resolve each CS object to get C0, C1, N from the Separation definition
- // csMap: "CS0" → "1 0.57 0 0.38" (CMYK string for tint=1)
- const csMap = new Map<string, string>();
- for (const [csName, objNum] of csRefs) {
- const csContent = resolveObject(pdf, pdfStr, objNum);
- if (!csContent) continue;
- // Parse: [/Separation /name /DeviceCMYK << /C0[...] /C1[...] /N n ... >>]
- const c0Match = csContent.match(/\/C0\s*\[([^\]]+)\]/);
- const c1Match = csContent.match(/\/C1\s*\[([^\]]+)\]/);
- const nMatch = csContent.match(/\/N\s+([\d.]+)/);
- if (!c0Match || !c1Match) continue;
- const c0 = c0Match[1].trim().split(/\s+/).map(Number);
- const c1 = c1Match[1].trim().split(/\s+/).map(Number);
- const n = nMatch ? parseFloat(nMatch[1]) : 1;
- if (c0.length !== 4 || c1.length !== 4) continue;
- if (c0.some(isNaN) || c1.some(isNaN)) continue;
- // Compute CMYK at tint=1: C0 + 1^N * (C1 - C0) = C1
- const cmyk = c1.map((v, i) => {
- const val = c0[i] + Math.pow(1, n) * (v - c0[i]);
- return Number(val.toFixed(4)).toString();
- });
- csMap.set(csName, cmyk.join(' '));
- console.log(` Color space /${csName} (obj ${objNum}): → CMYK(${cmyk.join(', ')})`);
- }
- if (csMap.size === 0) return pdfStr;
- // Step 3: Replace /CSn cs T scn → CMYK k in content streams
- const streamRe = />>\s*stream[\r\n]+/g;
- let m;
- while ((m = streamRe.exec(pdfStr)) !== null) {
- const start = m.index + m[0].length;
- const lookback = pdfStr.substring(Math.max(0, m.index - 300), m.index);
- const lastObj = lookback.lastIndexOf(' 0 obj');
- if (lastObj === -1) continue;
- const beforeObj = lookback.substring(Math.max(0, lastObj - 10), lastObj);
- const numMatch = beforeObj.match(/(\d+)\s*$/);
- if (!numMatch) continue;
- const end = pdfStr.indexOf('endstream', start);
- if (end === -1) continue;
- const data = pdf.subarray(start, end);
- let dec: Buffer;
- try { dec = inflateSync(data); } catch { continue; }
- let text = dec.toString('utf-8');
- const orig = text;
- for (const [csName, cmyk] of csMap) {
- // Match /<CSname> cs <tint> scn (with flexible whitespace)
- const pat = new RegExp(`\\/${csName}\\s+cs\\s+([\\d.]+)\\s+scn`, 'g');
- text = text.replace(pat, (_: string, tint: string) => {
- // For tint != 1 we would need to recompute; tint=1 is by far the most common case
- if (parseFloat(tint) === 1) return `${cmyk} k`;
- // For other tint values, leave unchanged (rare)
- return _;
- });
- }
- if (text === orig) continue;
- const compressed = deflateSync(Buffer.from(text));
- if (compressed.length > end - start) continue;
- compressed.copy(pdf, start);
- for (let i = start + compressed.length; i < end; i++) pdf[i] = 0;
- updateStreamLength(pdf, pdfStr, start, compressed.length);
- pdfStr = pdf.toString('binary');
- }
- return pdfStr;
- }
- /**
- * Resolve an object by number — check ObjStm streams first, then standalone objects.
- * Returns the object's text content, or null if not found/unreadable.
- */
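- // ObjStm layout (PDF 1.5+): the stream begins with N integer pairs
- // "objNum offset", followed by the object bodies; each offset is relative
- // to the /First byte. E.g. (hypothetical numbers) a header "12 0 13 57"
- // with /First 16 puts object 12 at byte 16 and object 13 at byte 73.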
- function resolveObject(pdf: Buffer, pdfStr: string, objNum: number): string | null {
- // Try standalone object first
- const standaloneRe = new RegExp(`(?:^|[\\r\\n])${objNum}\\s+0\\s+obj\\b`);
- const sm = standaloneRe.exec(pdfStr);
- if (sm) {
- const objStart = sm.index + sm[0].length;
- const objEnd = pdfStr.indexOf('endobj', objStart);
- if (objEnd !== -1) return pdfStr.substring(objStart, objEnd).trim();
- }
- // Search ObjStm streams
- const re = />>\s*stream[\r\n]+/g;
- let m;
- while ((m = re.exec(pdfStr)) !== null) {
- const start = m.index + m[0].length;
- const lookback = pdfStr.substring(Math.max(0, m.index - 500), m.index);
- const lastObj = lookback.lastIndexOf(' 0 obj');
- if (lastObj === -1) continue;
- const dictText = lookback.substring(lastObj + 6);
- if (!/\/ObjStm/.test(dictText)) continue;
- const firstMatch = dictText.match(/\/First\s+(\d+)/);
- if (!firstMatch) continue;
- const first = parseInt(firstMatch[1]);
- const end = pdfStr.indexOf('endstream', start);
- if (end === -1) continue;
- const data = pdf.subarray(start, end);
- let dec: Buffer;
- try { dec = inflateSync(data); } catch { continue; }
- const text = dec.toString('utf-8');
- const headerMatch = text.match(/^([\d\s]+)/);
- if (!headerMatch) continue;
- const nums = headerMatch[1].trim().split(/\s+/).map(Number);
- for (let i = 0; i < nums.length; i += 2) {
- if (nums[i] !== objNum) continue;
- const offset = nums[i + 1];
- const nextOffset = i + 2 < nums.length ? nums[i + 3] : text.length - first;
- return text.substring(first + offset, first + nextOffset).trim();
- }
- }
- return null;
- }
- /**
- * Detect and truncate garbled content in decompressed page content streams.
- * OCR errors can produce wrong deflate output that inflates OK but contains
- * invalid PDF operators, causing accidental fills/clips.
- */
- function truncateGarbledContentStreams(pdf: Buffer, pdfStr: string): string {
- const re = />>\s*stream[\r\n]+/g;
- let m;
- while ((m = re.exec(pdfStr)) !== null) {
- const start = m.index + m[0].length;
- const lookback = pdfStr.substring(Math.max(0, m.index - 300), m.index);
- const lastObj = lookback.lastIndexOf(' 0 obj');
- if (lastObj === -1) continue;
- const dictText = lookback.substring(lastObj + 6);
- // Only process content streams (FlateDecode, not ObjStm/XRef)
- if (!/FlateDecode/.test(dictText)) continue;
- if (/\/ObjStm|\/XRef/.test(dictText)) continue;
- const end = pdfStr.indexOf('endstream', start);
- if (end === -1) continue;
- const data = pdf.subarray(start, end);
- let dec: Buffer;
- try { dec = inflateSync(data); } catch { continue; }
- const text = dec.toString('utf-8');
- const lines = text.split('\n');
- // Find first garbled line
- let firstBad = -1;
- for (let i = 0; i < lines.length; i++) {
- const line = lines[i].trim();
- if (line.length === 0) continue;
- if (isGarbledLine(line)) { firstBad = i; break; }
- }
- if (firstBad === -1) continue;
- // Track BT/q state in good portion
- let btDepth = 0, qDepth = 0;
- for (let i = 0; i < firstBad; i++) {
- const line = lines[i].trim();
- const bt = line.match(/\bBT\b/g), et = line.match(/\bET\b/g);
- const q = line.match(/\bq\b/g), Q = line.match(/\bQ\b/g);
- if (bt) btDepth += bt.length;
- if (et) btDepth -= et.length;
- if (q) qDepth += q.length;
- if (Q) qDepth -= Q.length;
- }
- let truncated = lines.slice(0, firstBad).join('\n') + '\n';
- if (btDepth > 0) truncated += 'ET\n';
- while (qDepth > 0) { truncated += 'Q\n'; qDepth--; }
- const compressed = deflateSync(Buffer.from(truncated));
- if (compressed.length > end - start) continue;
- compressed.copy(pdf, start);
- for (let i = start + compressed.length; i < end; i++) pdf[i] = 0;
- updateStreamLength(pdf, pdfStr, start, compressed.length);
- pdfStr = pdf.toString('binary');
- }
- return pdfStr;
- }
- function isGarbledLine(line: string): boolean {
- // Non-printable chars outside parenthesized strings
- let inStr = false, depth = 0, bad = 0;
- for (let j = 0; j < line.length; j++) {
- const c = line.charCodeAt(j);
- if (line[j] === '(' && !inStr) { inStr = true; depth++; }
- else if (line[j] === '(' && inStr) depth++;
- else if (line[j] === ')' && inStr) { depth--; if (depth === 0) inStr = false; }
- if (!inStr && (c < 32 || c > 126) && c !== 9) bad++;
- }
- if (bad > 0) return true;
- // Operators merged with numbers: Q703, -26547Q
- if (/[a-zA-Z]{2}\d{3,}[a-zA-Z]/.test(line)) return true;
- if (/[QqfsSBWnhmc]\d{3,}/.test(line) && !/scn/.test(line)) return true;
- if (/\d{3,}[QqfhBWnm](?:\s|$)/.test(line)) return true;
- // Numbers with impossible dots: 0.5.148
- if (/\d+\.\d+\.\d+/.test(line) && !/Tm/.test(line) && !line.includes('(')) return true;
- // Very long numbers (garbled)
- if (/\d{6,}/.test(line) && !/Tm|cm/.test(line)) return true;
- return false;
- }
- /** Update /Length in stream dictionary after recompressing */
- function updateStreamLength(pdf: Buffer, pdfStr: string, streamStart: number, newLen: number): void {
- // Search backward for the ">>" that ends the dictionary, then find /Length
- // within the dictionary only (not in preceding stream data).
- const area = pdfStr.substring(Math.max(0, streamStart - 500), streamStart);
- // Find the last ">>" before "stream" — that's the dictionary end
- const dictEnd = area.lastIndexOf('>>');
- if (dictEnd === -1) return;
- // Find the start of this object's dictionary (look for "obj")
- const objStart = area.lastIndexOf(' 0 obj');
- const dictStart = objStart >= 0 ? objStart : 0;
- const dictOnly = area.substring(dictStart, dictEnd + 2);
- const lengthMatch = dictOnly.match(/\/Length\s+(\d+)/);
- if (!lengthMatch) return;
- const oldLen = lengthMatch[1];
- const padded = String(newLen).padStart(oldLen.length, ' ');
- if (padded.length !== oldLen.length) return; // new length has more digits; can't patch in place
- // Locate the digits via the matched text itself; re-searching with a
- // hard-coded single space would miss /Length written with other whitespace.
- const matchIdx = dictOnly.lastIndexOf(lengthMatch[0]);
- const offset = Math.max(0, streamStart - 500) + dictStart + matchIdx + lengthMatch[0].length - oldLen.length;
- for (let i = 0; i < oldLen.length; i++) pdf[offset + i] = padded.charCodeAt(i);
- }
- // ── Memory-efficient brute-force l→1 ────────────────────────────────
- /**
- * Try l→1 substitutions using direct stream data manipulation.
- * No full PDF re-decode per trial — only modifies affected bytes in
- * a pre-extracted stream data buffer, then tries inflate.
- */
- function bruteForceL1(
- b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
- maxDepth: number = 4,
- ): { fixed: boolean; changes: number; depth: number } {
- const b64Start = pdfToB64(stream.start);
- const b64End = Math.min(b64Buf.length, pdfToB64(stream.end) + 4);
- const lPos: number[] = [];
- for (let i = b64Start; i < b64End; i++) {
- if (b64Buf[i] === L_BYTE) lPos.push(i);
- }
- // Depth 1
- for (const p of lPos) {
- const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, p, ONE_BYTE);
- if (changed && canInflate(streamData)) {
- b64Buf[p] = ONE_BYTE;
- return { fixed: true, changes: 1, depth: 1 };
- }
- revertB64Change(streamData, stream.start, p, oldBytes);
- }
- if (maxDepth < 2) return { fixed: false, changes: 0, depth: 0 };
- // Depth 2
- for (let i = 0; i < lPos.length; i++) {
- const r1 = applyB64Change(b64Buf, streamData, stream.start, lPos[i], ONE_BYTE);
- b64Buf[lPos[i]] = ONE_BYTE;
- for (let j = i + 1; j < lPos.length; j++) {
- const r2 = applyB64Change(b64Buf, streamData, stream.start, lPos[j], ONE_BYTE);
- if (r2.changed && canInflate(streamData)) {
- b64Buf[lPos[j]] = ONE_BYTE;
- return { fixed: true, changes: 2, depth: 2 };
- }
- revertB64Change(streamData, stream.start, lPos[j], r2.oldBytes);
- }
- b64Buf[lPos[i]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[i], r1.oldBytes);
- }
- if (maxDepth < 3 || lPos.length > 50) return { fixed: false, changes: 0, depth: 0 };
- // Depth 3
- for (let i = 0; i < lPos.length; i++) {
- const r1 = applyB64Change(b64Buf, streamData, stream.start, lPos[i], ONE_BYTE);
- b64Buf[lPos[i]] = ONE_BYTE;
- for (let j = i + 1; j < lPos.length; j++) {
- const r2 = applyB64Change(b64Buf, streamData, stream.start, lPos[j], ONE_BYTE);
- b64Buf[lPos[j]] = ONE_BYTE;
- for (let k = j + 1; k < lPos.length; k++) {
- const r3 = applyB64Change(b64Buf, streamData, stream.start, lPos[k], ONE_BYTE);
- if (r3.changed && canInflate(streamData)) {
- b64Buf[lPos[k]] = ONE_BYTE;
- return { fixed: true, changes: 3, depth: 3 };
- }
- revertB64Change(streamData, stream.start, lPos[k], r3.oldBytes);
- }
- b64Buf[lPos[j]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[j], r2.oldBytes);
- }
- b64Buf[lPos[i]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[i], r1.oldBytes);
- }
- if (maxDepth < 4 || lPos.length > 45) return { fixed: false, changes: 0, depth: 0 };
- // Depth 4
- for (let i = 0; i < lPos.length; i++) {
- const r1 = applyB64Change(b64Buf, streamData, stream.start, lPos[i], ONE_BYTE);
- b64Buf[lPos[i]] = ONE_BYTE;
- for (let j = i + 1; j < lPos.length; j++) {
- const r2 = applyB64Change(b64Buf, streamData, stream.start, lPos[j], ONE_BYTE);
- b64Buf[lPos[j]] = ONE_BYTE;
- for (let k = j + 1; k < lPos.length; k++) {
- const r3 = applyB64Change(b64Buf, streamData, stream.start, lPos[k], ONE_BYTE);
- b64Buf[lPos[k]] = ONE_BYTE;
- for (let m = k + 1; m < lPos.length; m++) {
- const r4 = applyB64Change(b64Buf, streamData, stream.start, lPos[m], ONE_BYTE);
- if (r4.changed && canInflate(streamData)) {
- b64Buf[lPos[m]] = ONE_BYTE;
- return { fixed: true, changes: 4, depth: 4 };
- }
- revertB64Change(streamData, stream.start, lPos[m], r4.oldBytes);
- }
- b64Buf[lPos[k]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[k], r3.oldBytes);
- }
- b64Buf[lPos[j]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[j], r2.oldBytes);
- }
- b64Buf[lPos[i]] = L_BYTE;
- revertB64Change(streamData, stream.start, lPos[i], r1.oldBytes);
- }
- return { fixed: false, changes: 0, depth: 0 };
- }
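- /**
-  * Sketch (not called anywhere): the four hand-unrolled depth loops above
-  * could be generalized into one recursive combination search. Uses the same
-  * applyB64Change/revertB64Change/canInflate helpers; exploration order
-  * differs from the unrolled version (depth-first instead of level-by-level).
-  */
- function bruteForceL1Recursive(
-   b64Buf: Buffer, streamData: Buffer, streamStart: number,
-   lPos: number[], startIdx: number, depth: number,
- ): boolean {
-   if (depth === 0) return false;
-   for (let i = startIdx; i < lPos.length; i++) {
-     const { oldBytes, changed } = applyB64Change(b64Buf, streamData, streamStart, lPos[i], ONE_BYTE);
-     b64Buf[lPos[i]] = ONE_BYTE;
-     if (changed && canInflate(streamData)) return true; // keep this change
-     if (bruteForceL1Recursive(b64Buf, streamData, streamStart, lPos, i + 1, depth - 1)) return true;
-     b64Buf[lPos[i]] = L_BYTE; // backtrack
-     revertB64Change(streamData, streamStart, lPos[i], oldBytes);
-   }
-   return false;
- }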
- // ── Iterative error fixing ───────────────────────────────────────────
- /**
- * Check if inflate can get past a known error position.
- * Returns true if the error at errPos is no longer present
- * (may still fail later at a different position).
- */
- function passesErrPos(streamData: Buffer, errPos: number): boolean {
- if (errPos < 3 || streamData.length < 3) return false;
- const testLen = Math.min(streamData.length - 2, errPos + 3);
- try {
- inflateRawSync(streamData.subarray(2, 2 + testLen), { finishFlush: constants.Z_SYNC_FLUSH });
- return true;
- } catch (e: any) {
- const msg = String(e.message || '');
- return msg.includes('unexpected end') || msg.includes('buffer error') || msg.includes('incomplete');
- }
- }
- /**
- * Iteratively fix stream errors by finding and fixing one corruption at a time.
- * For each error position, tries all base64 substitutions in a window before the
- * error. Keeps changes that push the error position significantly forward.
- * Reverts all changes if the stream doesn't ultimately decompress.
- */
- function iterativeErrorFix(
- b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
- maxIter: number = 15,
- windowB64: number = 300,
- ): { fixed: boolean; totalChanges: number; desc: string } {
- const changes: { pos: number; from: number; oldBytes: [number, number, number] }[] = [];
- for (let iter = 0; iter < maxIter; iter++) {
- if (canInflate(streamData)) {
- return { fixed: true, totalChanges: changes.length, desc: `${changes.length} fixes in ${iter} iter` };
- }
- const errPos = findCorruptionOffset(streamData);
- if (errPos >= streamData.length - 4) {
- // Error at end — try checksum repair
- if (tryFixChecksumOnData(streamData)) {
- return { fixed: true, totalChanges: changes.length, desc: `${changes.length} fixes + checksum` };
- }
- break;
- }
- const errPdfPos = stream.start + errPos;
- const b64Center = pdfToB64(errPdfPos);
- const searchLo = Math.max(pdfToB64(stream.start), b64Center - windowB64);
- const searchHi = Math.min(pdfToB64(stream.end) + 4, b64Center + 20);
- let bestPos = -1;
- let bestTo = 0;
- let bestNewErrPos = errPos;
- let bestOldBytes: [number, number, number] = [0, 0, 0];
- for (let i = searchLo; i < searchHi; i++) {
- const origChar = b64Buf[i];
- // Try confusion pairs first (more likely correct), then all other chars
- const confPairs = SUBS_MAP.get(origChar) || [];
- const allChars: number[] = [...confPairs];
- for (let c = 0; c < 64; c++) {
- const ch = B64_CHARS.charCodeAt(c);
- if (ch !== origChar && !confPairs.includes(ch)) allChars.push(ch);
- }
- for (const newChar of allChars) {
- const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, i, newChar);
- if (!changed) { revertB64Change(streamData, stream.start, i, oldBytes); continue; }
- if (canInflate(streamData)) {
- b64Buf[i] = newChar;
- return { fixed: true, totalChanges: changes.length + 1, desc: `${changes.length + 1} fixes` };
- }
- if (passesErrPos(streamData, errPos)) {
- const newErrPos = findCorruptionOffset(streamData);
- if (newErrPos > bestNewErrPos) {
- bestNewErrPos = newErrPos;
- bestPos = i;
- bestTo = newChar;
- bestOldBytes = [...oldBytes] as [number, number, number];
- }
- }
- revertB64Change(streamData, stream.start, i, oldBytes);
- }
- }
- if (bestPos === -1 || bestNewErrPos <= errPos) break;
- // Apply best change
- const origChar = b64Buf[bestPos];
- applyB64Change(b64Buf, streamData, stream.start, bestPos, bestTo);
- b64Buf[bestPos] = bestTo;
- changes.push({ pos: bestPos, from: origChar, oldBytes: bestOldBytes });
- }
- // Check one more time (might have fixed all data errors, just checksum left)
- if (canInflate(streamData)) {
- return { fixed: true, totalChanges: changes.length, desc: `${changes.length} fixes` };
- }
- // Not fixed — revert all changes in reverse order
- for (let i = changes.length - 1; i >= 0; i--) {
- const ch = changes[i];
- applyB64Change(b64Buf, streamData, stream.start, ch.pos, ch.from);
- b64Buf[ch.pos] = ch.from;
- }
- const partial = changes.length > 0 ? ` (${changes.length} partial, reverted)` : '';
- return { fixed: false, totalChanges: 0, desc: `no convergence${partial}` };
- }
- /**
- * Try to fix Adler-32 checksum on a standalone stream data buffer.
- * Modifies the buffer in-place if successful.
- */
- function tryFixChecksumOnData(streamData: Buffer): boolean {
- if (streamData.length < 6) return false;
- for (const trim of [2, 1, 0]) {
- const actualEnd = streamData.length - trim;
- if (actualEnd < 6) continue;
- let decompressed: Buffer;
- try { decompressed = inflateRawSync(streamData.subarray(2, actualEnd)); } catch { continue; }
- const adler = computeAdler32(decompressed);
- const pos = actualEnd - 4;
- const old = [streamData[pos], streamData[pos+1], streamData[pos+2], streamData[pos+3]];
- streamData[pos] = (adler >>> 24) & 0xFF;
- streamData[pos + 1] = (adler >>> 16) & 0xFF;
- streamData[pos + 2] = (adler >>> 8) & 0xFF;
- streamData[pos + 3] = adler & 0xFF;
- if (canInflate(streamData)) return true;
- streamData[pos] = old[0]; streamData[pos+1] = old[1]; streamData[pos+2] = old[2]; streamData[pos+3] = old[3];
- }
- return false;
- }
- // ── DFS search with backtracking ─────────────────────────────────────
- /**
- * Depth-first search for multi-error streams.
- * At each level, find the error position, try substitutions, and for the
- * top candidates that push past the error, recurse to fix the next error.
- * Backtracks if a path doesn't converge.
- */
- function dfsErrorSearch(
- b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
- maxDepth: number = 8,
- branchFactor: number = 2,
- windowB64: number = 200,
- ): { fixed: boolean; totalChanges: number; desc: string } {
- const applied: { pos: number; origChar: number }[] = [];
- let nodeCount = 0;
- const maxNodes = 500; // Limit total search nodes to prevent OOM
- function search(depth: number): boolean {
- if (canInflate(streamData)) return true;
- if (depth >= maxDepth || nodeCount >= maxNodes) {
- return tryFixChecksumOnData(streamData);
- }
- nodeCount++;
- const errPos = findCorruptionOffset(streamData);
- if (errPos >= streamData.length - 4) {
- return tryFixChecksumOnData(streamData);
- }
- const errPdfPos = stream.start + errPos;
- const b64Center = pdfToB64(errPdfPos);
- const lo = Math.max(pdfToB64(stream.start), b64Center - windowB64);
- const hi = Math.min(pdfToB64(stream.end) + 4, b64Center + 20);
- // Find candidates — confusion pairs first, fallback to all chars
- const candidates: { pos: number; to: number; improvement: number }[] = [];
- // Pass 1: confusion pairs only
- for (let i = lo; i < hi; i++) {
- const origChar = b64Buf[i];
- const confPairs = SUBS_MAP.get(origChar);
- if (!confPairs) continue;
- for (const newChar of confPairs) {
- const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, i, newChar);
- if (!changed) { revertB64Change(streamData, stream.start, i, oldBytes); continue; }
- if (canInflate(streamData)) {
- b64Buf[i] = newChar;
- applied.push({ pos: i, origChar });
- return true;
- }
- let improvement = 0;
- if (passesErrPos(streamData, errPos)) {
- const newErrPos = findCorruptionOffset(streamData);
- improvement = newErrPos - errPos;
- }
- revertB64Change(streamData, stream.start, i, oldBytes);
- if (improvement > 0) {
- candidates.push({ pos: i, to: newChar, improvement });
- }
- }
- }
- // Pass 2: if confusion pairs found nothing, try all chars in very tight window
- if (candidates.length === 0) {
- const tightLo = Math.max(pdfToB64(stream.start), b64Center - 40);
- const tightHi = Math.min(pdfToB64(stream.end) + 4, b64Center + 10);
- for (let i = tightLo; i < tightHi; i++) {
- const origChar = b64Buf[i];
- for (let c = 0; c < 64; c++) {
- const newChar = B64_CHARS.charCodeAt(c);
- if (newChar === origChar) continue;
- if (SUBS_MAP.get(origChar)?.includes(newChar)) continue; // already tried
- const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, i, newChar);
- if (!changed) { revertB64Change(streamData, stream.start, i, oldBytes); continue; }
- if (canInflate(streamData)) {
- b64Buf[i] = newChar;
- applied.push({ pos: i, origChar });
- return true;
- }
- let improvement = 0;
- if (passesErrPos(streamData, errPos)) {
- const newErrPos = findCorruptionOffset(streamData);
- improvement = newErrPos - errPos;
- }
- revertB64Change(streamData, stream.start, i, oldBytes);
- if (improvement > 0) {
- candidates.push({ pos: i, to: newChar, improvement });
- }
- }
- }
- }
- candidates.sort((a, b) => b.improvement - a.improvement);
- for (const cand of candidates.slice(0, branchFactor)) {
- if (nodeCount >= maxNodes) break;
- const origChar = b64Buf[cand.pos];
- applyB64Change(b64Buf, streamData, stream.start, cand.pos, cand.to);
- b64Buf[cand.pos] = cand.to;
- applied.push({ pos: cand.pos, origChar });
- if (search(depth + 1)) return true;
- applied.pop();
- applyB64Change(b64Buf, streamData, stream.start, cand.pos, origChar);
- b64Buf[cand.pos] = origChar;
- }
- return false;
- }
- const success = search(0);
- if (!success) {
- for (let i = applied.length - 1; i >= 0; i--) {
- const { pos, origChar } = applied[i];
- applyB64Change(b64Buf, streamData, stream.start, pos, origChar);
- b64Buf[pos] = origChar;
- }
- return { fixed: false, totalChanges: 0, desc: `DFS exhausted (${nodeCount} nodes)` };
- }
- return { fixed: true, totalChanges: applied.length, desc: `DFS ${applied.length} fixes` };
- }
- // ── Position-guided search (all confusion pairs, memory-efficient) ──
- function positionGuidedSearch(
- b64Buf: Buffer, streamData: Buffer, stream: StreamInfo,
- windowRadius: number = 60,
- ): { fixed: boolean; changes: number; desc: string } {
- const errPos = findCorruptionOffset(streamData);
- const errorPdfPos = stream.start + errPos;
- const b64Center = pdfToB64(errorPdfPos);
- interface Cand { pos: number; from: number; to: number }
- const candidates: Cand[] = [];
- const lo = Math.max(pdfToB64(stream.start), b64Center - windowRadius);
- const hi = Math.min(pdfToB64(stream.end) + 4, b64Center + windowRadius);
- for (let i = lo; i < hi; i++) {
- const ch = b64Buf[i];
- const subs = SUBS_MAP.get(ch);
- if (subs) {
- for (const to of subs) candidates.push({ pos: i, from: ch, to });
- }
- }
- if (candidates.length === 0) {
- return { fixed: false, changes: 0, desc: `no candidates near byte ${errPos}` };
- }
- // Depth 1
- for (const c of candidates) {
- const { oldBytes, changed } = applyB64Change(b64Buf, streamData, stream.start, c.pos, c.to);
- if (changed && canInflate(streamData)) {
- b64Buf[c.pos] = c.to;
- return { fixed: true, changes: 1, desc: `1x ${String.fromCharCode(c.from)}->${String.fromCharCode(c.to)} near byte ${errPos}` };
- }
- revertB64Change(streamData, stream.start, c.pos, oldBytes);
- }
- // Depth 2
- const maxDouble = Math.min(candidates.length, 40);
- for (let i = 0; i < maxDouble; i++) {
- const r1 = applyB64Change(b64Buf, streamData, stream.start, candidates[i].pos, candidates[i].to);
- b64Buf[candidates[i].pos] = candidates[i].to;
- for (let j = i + 1; j < maxDouble; j++) {
- if (b64Buf[candidates[j].pos] !== candidates[j].from) continue;
- const r2 = applyB64Change(b64Buf, streamData, stream.start, candidates[j].pos, candidates[j].to);
- if (r2.changed && canInflate(streamData)) {
- b64Buf[candidates[j].pos] = candidates[j].to;
- return { fixed: true, changes: 2, desc: `2x near byte ${errPos}` };
- }
- revertB64Change(streamData, stream.start, candidates[j].pos, r2.oldBytes);
- }
- b64Buf[candidates[i].pos] = candidates[i].from;
- revertB64Change(streamData, stream.start, candidates[i].pos, r1.oldBytes);
- }
- // Depth 3
- const maxTriple = Math.min(candidates.length, 25);
- for (let i = 0; i < maxTriple; i++) {
- const r1 = applyB64Change(b64Buf, streamData, stream.start, candidates[i].pos, candidates[i].to);
- b64Buf[candidates[i].pos] = candidates[i].to;
- for (let j = i + 1; j < maxTriple; j++) {
- if (b64Buf[candidates[j].pos] !== candidates[j].from) continue;
- const r2 = applyB64Change(b64Buf, streamData, stream.start, candidates[j].pos, candidates[j].to);
- b64Buf[candidates[j].pos] = candidates[j].to;
- for (let k = j + 1; k < maxTriple; k++) {
- if (b64Buf[candidates[k].pos] !== candidates[k].from) continue;
- const r3 = applyB64Change(b64Buf, streamData, stream.start, candidates[k].pos, candidates[k].to);
- if (r3.changed && canInflate(streamData)) {
- b64Buf[candidates[k].pos] = candidates[k].to;
- return { fixed: true, changes: 3, desc: `3x near byte ${errPos}` };
- }
- revertB64Change(streamData, stream.start, candidates[k].pos, r3.oldBytes);
- }
- b64Buf[candidates[j].pos] = candidates[j].from;
- revertB64Change(streamData, stream.start, candidates[j].pos, r2.oldBytes);
- }
- b64Buf[candidates[i].pos] = candidates[i].from;
- revertB64Change(streamData, stream.start, candidates[i].pos, r1.oldBytes);
- }
- return { fixed: false, changes: 0, desc: `no fix (${candidates.length} candidates near byte ${errPos}/${stream.size})` };
- }
- // ── Main ────────────────────────────────────────────────────────────
- async function main() {
- const args = process.argv.slice(2);
- if (args.length < 2) {
- console.log("Usage: bun recover_pdf.ts <input.txt> <output.pdf>");
- process.exit(1);
- }
- const inputFile = args[0];
- const outputFile = args[1];
- console.log("=".repeat(64));
- console.log(" PDF Recovery Script");
- console.log("=".repeat(64));
- // ─── Step 1: Read and clean base64 ──────────────────────────────
- console.log(`\n[1/6] Reading ${inputFile}...`);
- const rawBase64 = readFileSync(inputFile, "utf-8").trim();
- console.log(` ${rawBase64.length} characters (with whitespace)`);
- let base64Clean = rawBase64.replace(/\s+/g, '');
- console.log(` ${base64Clean.length} base64 characters`);
- // ─── Step 2: Auto-discover and fix OCR errors ──────────────────
- console.log("\n[2/6] Auto-discovering and fixing OCR errors...");
- const autoResult = autoDiscoverAndFixOCRErrors(base64Clean);
- let base64Buf = autoResult.base64Buf;
- let pdf = autoResult.pdf;
- let pdfStr = autoResult.pdfStr;
- console.log(` ${autoResult.fixCount} auto-discovered fixes applied`);
- // ─── Step 3: Analyze streams ────────────────────────────────────
- console.log("\n[3/6] Analyzing streams...");
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- let allStreams = findAllStreams(pdfStr);
- let flatStreams = allStreams.filter(s => s.hasFilter);
- const nonFlatStreams = allStreams.filter(s => !s.hasFilter);
- let okCount = flatStreams.filter(s => canInflate(pdf.subarray(s.start, s.end))).length;
- console.log(` Found ${allStreams.length} streams total`);
- console.log(` ${flatStreams.length} FlateDecode (compressed), ${okCount} decompress OK`);
- console.log(` ${nonFlatStreams.length} uncompressed (skipped)`);
- const pageContentObjs = findPageContentObjects(pdfStr);
- // ─── Step 4: Fix Adler-32 checksums ─────────────────────────────
- console.log("\n[4/6] Repairing Adler-32 checksums...");
- let checksumFixed = 0;
- for (const stream of flatStreams) {
- if (canInflate(pdf.subarray(stream.start, stream.end))) continue;
- if (tryFixChecksum(pdf, stream)) {
- checksumFixed++;
- console.log(` Object ${stream.obj}: checksum fixed`);
- }
- }
- if (checksumFixed > 0) {
- base64Buf = Buffer.from(pdf.toString('base64'));
- pdfStr = pdf.toString('binary');
- console.log(` ${checksumFixed} streams fixed via checksum repair`);
- } else {
- console.log(" (no checksum-only errors found)");
- }
- // Refresh stream list
- allStreams = findAllStreams(pdfStr);
- flatStreams = allStreams.filter(s => s.hasFilter);
- okCount = flatStreams.filter(s => canInflate(pdf.subarray(s.start, s.end))).length;
- console.log(` Status: ${okCount}/${flatStreams.length} compressed streams OK`);
- // ─── Step 5: Stream repair ──────────────────────────────────────
- console.log("\n[5/6] Stream repair...");
- const failedStreams = flatStreams
- .filter(s => !canInflate(pdf.subarray(s.start, s.end)))
- .sort((a, b) => {
- const aPage = pageContentObjs.includes(a.obj);
- const bPage = pageContentObjs.includes(b.obj);
- if (aPage && !bPage) return -1;
- if (!aPage && bPage) return 1;
- return a.size - b.size;
- });
- console.log(` ${failedStreams.length} streams to repair\n`);
- let totalFixed = checksumFixed;
- for (const stream of failedStreams) {
- // Re-decode PDF to get current state (only once per stream, not per trial)
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- const currentStream = findObjStream(pdfStr, stream.obj) ||
- findAllStreams(pdfStr).find(s => s.obj === stream.obj);
- if (!currentStream) continue;
- // Extract stream data for in-place manipulation
- const streamData = Buffer.from(pdf.subarray(currentStream.start, currentStream.end));
- if (canInflate(streamData)) continue;
- const isPage = pageContentObjs.includes(stream.obj);
- const label = isPage ? '[PAGE]' : ' ';
- process.stdout.write(` Obj ${String(stream.obj).padStart(2)} ${label} (${currentStream.size}b): `);
- // Phase A: Position-guided search with confusion pairs (memory-efficient)
- const guided = positionGuidedSearch(base64Buf, streamData, currentStream);
- if (guided.fixed) {
- totalFixed++;
- console.log(`FIXED [guided] ${guided.desc}`);
- continue;
- }
- // Phase B: Brute-force l→1 (memory-efficient)
- const maxBfDepth = currentStream.size < 3000 ? 4 : 2;
- const bf = bruteForceL1(base64Buf, streamData, currentStream, maxBfDepth);
- if (bf.fixed) {
- totalFixed++;
- console.log(`FIXED [brute-force] ${bf.changes}x l->1 (depth ${bf.depth})`);
- continue;
- }
- // Phase C: Iterative error fixing (greedy, one error at a time)
- const maxIter = currentStream.size < 5000 ? 15 : 10;
- const window = currentStream.size < 5000 ? 300 : 200;
- const iterResult = iterativeErrorFix(base64Buf, streamData, currentStream, maxIter, window);
- if (iterResult.fixed) {
- totalFixed++;
- console.log(`FIXED [iterative] ${iterResult.desc}`);
- continue;
- }
- // Not fixed
- const err = getError(streamData);
- console.log(`not fixed (${err.substring(0, 45)}) ${iterResult.desc}`);
- }
- // ─── Step 5b: Post-repair checksum pass ─────────────────────────
- // Iterative fixes corrected data errors in b64Buf but checksum
- // patches were only applied to local streamData buffers. Re-decode
- // and repair checksums on all remaining failing streams.
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- console.log("\n Post-repair checksum pass...");
- let postChecksumFixed = 0;
- const refreshedStreams = findAllStreams(pdfStr).filter(s => s.hasFilter);
- for (const stream of refreshedStreams) {
- if (canInflate(pdf.subarray(stream.start, stream.end))) continue;
- if (tryFixChecksum(pdf, stream)) {
- postChecksumFixed++;
- totalFixed++;
- }
- }
- if (postChecksumFixed > 0) {
- console.log(` ${postChecksumFixed} additional streams fixed via checksum repair`);
- pdfStr = pdf.toString('binary');
- } else {
- console.log(" (no additional checksum fixes)");
- }
- // ─── Step 5c: Second pass with wider search for remaining ──────
- const stillFailing = findAllStreams(pdfStr)
- .filter(s => s.hasFilter && !canInflate(pdf.subarray(s.start, s.end)));
- if (stillFailing.length > 0) {
- // Re-sync base64Buf with the checksum-repaired pdf
- base64Buf = Buffer.from(pdf.toString('base64'));
- console.log(`\n Second pass (wider search) for ${stillFailing.length} remaining streams...\n`);
- for (const stream of stillFailing) {
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- const currentStream = findObjStream(pdfStr, stream.obj) ||
- findAllStreams(pdfStr).find(s => s.obj === stream.obj);
- if (!currentStream) continue;
- const streamData = Buffer.from(pdf.subarray(currentStream.start, currentStream.end));
- if (canInflate(streamData)) continue;
- process.stdout.write(` Obj ${String(stream.obj).padStart(2)} (${currentStream.size}b): `);
- if (currentStream.size > 15000) {
- console.log(`skipped (too large for DFS)`);
- continue;
- }
- // DFS with backtracking — explores alternative fixes when greedy gets stuck
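- // Keeps several candidate fixes per error position and backtracks when
- // a branch dead-ends; the trailing args are presumably maxIter=12,
- // branch=2, window=300, matching the knobs of the greedy phase.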
- const dfsResult = dfsErrorSearch(base64Buf, streamData, currentStream, 12, 2, 300);
- if (dfsResult.fixed) {
- totalFixed++;
- console.log(`FIXED [DFS] ${dfsResult.desc}`);
- } else {
- console.log(`not fixed ${dfsResult.desc}`);
- }
- }
- // Final checksum repair for any new fixes from second pass
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- for (const stream of findAllStreams(pdfStr).filter(s => s.hasFilter)) {
- if (canInflate(pdf.subarray(stream.start, stream.end))) continue;
- if (tryFixChecksum(pdf, stream)) { totalFixed++; postChecksumFixed++; }
- }
- pdfStr = pdf.toString('binary');
- }
- // ─── Step 5d: Rust-accelerated repair for remaining streams ─────
- const rustFailing = findAllStreams(pdfStr)
- .filter(s => s.hasFilter && !canInflate(pdf.subarray(s.start, s.end)));
- if (rustFailing.length > 0) {
- const rustBin = new URL("stream_fixer/target/release/stream_fixer", import.meta.url).pathname;
- const hasTool = existsSync(rustBin);
- if (hasTool) {
- console.log(`\n Rust-accelerated repair for ${rustFailing.length} remaining streams...\n`);
- // Write current b64 state to a temp file
- base64Buf = Buffer.from(pdf.toString('base64'));
- const tmpB64 = `/tmp/pdf_recover_b64_${process.pid}.txt`;
- writeFileSync(tmpB64, base64Buf);
- for (const stream of rustFailing) {
- const currentStream = findObjStream(pdfStr, stream.obj) ||
- findAllStreams(pdfStr).find(s => s.obj === stream.obj);
- if (!currentStream) continue;
- process.stdout.write(` Obj ${String(stream.obj).padStart(2)} (${currentStream.size}b): `);
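- // The helper takes the b64 temp file plus the stream's byte range and
- // prints JSON shaped like { fixed, changes: [{ b64_pos, new_char }],
- // desc } — shape inferred from the parsing below.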
- const result = spawnSync(rustBin, [
- tmpB64, String(currentStream.start), String(currentStream.end),
- "--max-iter", "30", "--max-nodes", "100000",
- "--window", "600", "--branch", "3", "--max-depth", "20"
- ], { encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"], timeout: 300000 });
- if (result.status !== 0) {
- console.log(`error (${(result.stderr || "").substring(0, 60)})`);
- continue;
- }
- try {
- const output = JSON.parse(result.stdout);
- if (output.fixed && output.changes && output.changes.length > 0) {
- // Apply the b64 changes
- for (const ch of output.changes) {
- const pos = ch.b64_pos;
- base64Buf[pos] = ch.new_char.charCodeAt(0);
- }
- // Re-decode PDF and update
- pdf = Buffer.from(base64Buf.toString(), 'base64');
- pdfStr = pdf.toString('binary');
- // Write updated b64 for next stream
- writeFileSync(tmpB64, base64Buf);
- // Checksum repair on this stream
- const fixed = findObjStream(pdfStr, stream.obj) ||
- findAllStreams(pdfStr).find(s => s.obj === stream.obj);
- if (fixed && fixed.hasFilter && !canInflate(pdf.subarray(fixed.start, fixed.end))) {
- if (tryFixChecksum(pdf, fixed)) {
- pdfStr = pdf.toString('binary');
- base64Buf = Buffer.from(pdf.toString('base64'));
- writeFileSync(tmpB64, base64Buf);
- }
- }
- const nowOk = !!(fixed && canInflate(pdf.subarray(fixed.start, fixed.end)));
- totalFixed += nowOk ? 1 : 0;
- console.log(`${nowOk ? 'FIXED' : 'partial'} [Rust] ${output.desc}`);
- } else if (output.fixed) {
- console.log(`already OK [Rust] ${output.desc}`);
- } else {
- console.log(`not fixed [Rust] ${output.desc}`);
- }
- } catch {
- console.log(`parse error: ${(result.stdout || "").substring(0, 60)}`);
- }
- }
- // Clean up temp file
- try { unlinkSync(tmpB64); } catch {}
- } else {
- console.log(`\n (Rust tool not found at ${rustBin} — skipping accelerated repair)`);
- console.log(` Build with: cd stream_fixer && cargo build --release`);
- }
- }
- // ─── Step 5e: Stream-end (BFINAL) repair for remaining streams ──
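- // DEFLATE data is a sequence of blocks, each headed by a BFINAL bit
- // (1 = last block) and a 2-bit BTYPE. If corruption clears BFINAL on
- // the last block, inflate runs off the end of the stream; setting the
- // bit (presumably what tryFixStreamEnd does) can terminate it cleanly.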
- const bfinalFailing = findAllStreams(pdfStr)
- .filter(s => s.hasFilter && !canInflate(pdf.subarray(s.start, s.end)));
- if (bfinalFailing.length > 0) {
- console.log(`\n Stream-end repair for ${bfinalFailing.length} remaining streams...\n`);
- for (const stream of bfinalFailing) {
- process.stdout.write(` Obj ${String(stream.obj).padStart(2)} (${stream.size}b): `);
- if (tryFixStreamEnd(pdf, stream)) {
- totalFixed++;
- pdfStr = pdf.toString('binary');
- console.log('FIXED [BFINAL]');
- } else {
- console.log('not fixed');
- }
- }
- }
- // ─── Step 5f: Visual repair (ObjStm alignment, color, content) ──
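- // Best-effort cosmetic passes (behaviour inferred from the function
- // names): re-align object-stream (/ObjStm) offset tables, truncate
- // content streams whose decoded operators are garbled, and substitute
- // a plain CMYK space for corrupted colour-space objects.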
- console.log('\n Visual repair...');
- pdfStr = fixObjStmAlignment(pdf, pdfStr);
- pdfStr = truncateGarbledContentStreams(pdf, pdfStr);
- pdfStr = replaceColorSpacesWithCMYK(pdf, pdfStr);
- console.log(' done.');
- // ─── Step 6: Final output ───────────────────────────────────────
- console.log("\n" + "=".repeat(64));
- console.log(" RESULTS");
- console.log("=".repeat(64));
- // Per-object detection for accurate results
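- // Earlier repairs can shift byte offsets, so each original object is
- // re-located in the current pdfStr instead of reusing stale ranges.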
- let finalOk = 0;
- let finalTotal = 0;
- const stillFailingList: { obj: number; size: number; err: string }[] = [];
- for (const origStream of flatStreams) {
- const s = findObjStream(pdfStr, origStream.obj) ||
- findAllStreams(pdfStr).find(x => x.obj === origStream.obj);
- if (!s || !s.hasFilter) continue;
- finalTotal++;
- const data = pdf.subarray(s.start, s.end);
- if (canInflate(data)) {
- finalOk++;
- } else {
- stillFailingList.push({ obj: s.obj, size: s.size, err: getError(data) });
- }
- }
- console.log(`\nCompressed streams: ${finalOk}/${finalTotal} OK`);
- console.log(`Total streams fixed: ${totalFixed}`);
- if (pageContentObjs.length > 0) {
- let pageOk = 0;
- console.log("\nPage content streams:");
- for (const objNum of pageContentObjs) {
- const s = findObjStream(pdfStr, objNum);
- if (s && s.hasFilter) {
- const ok = canInflate(pdf.subarray(s.start, s.end));
- if (ok) pageOk++;
- console.log(` Object ${objNum}: ${ok ? 'OK' : 'FAIL'}`);
- }
- }
- console.log(`Page content: ${pageOk}/${pageContentObjs.length} OK`);
- }
- if (stillFailingList.length > 0) {
- console.log(`\nStill failing (${stillFailingList.length}):`);
- for (const s of stillFailingList) {
- console.log(` Obj ${s.obj}: ${s.size}b - ${s.err.substring(0, 50)}`);
- }
- }
- // Save raw repaired PDF
- const rawFile = outputFile.replace(/\.pdf$/, "-raw.pdf");
- writeFileSync(rawFile, pdf);
- console.log(`\nSaved raw: ${rawFile}`);
- // Extract text and re-distill
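- // Ghostscript's pdfwrite rebuilds the xref and drops objects it cannot
- // parse, which can salvage more text than the raw repaired file;
- // whichever version yields more non-empty text lines wins below.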
- console.log("\n[6/6] Extracting text and re-distilling...");
- const rawText = extractText(rawFile);
- const rawLines = rawText.split('\n').filter((l: string) => l.trim());
- console.log(` Raw PDF text: ${rawLines.length} non-empty lines`);
- if (runGhostscript(rawFile, outputFile)) {
- console.log(` Saved: ${outputFile}`);
- const distilledText = extractText(outputFile);
- const distilledLines = distilledText.split('\n').filter((l: string) => l.trim());
- console.log(` Re-distilled text: ${distilledLines.length} non-empty lines`);
- const useDistilled = distilledLines.length >= rawLines.length;
- const bestText = useDistilled ? distilledText : rawText;
- const bestLines = bestText.split('\n').filter((l: string) => l.trim());
- const bestSource = useDistilled ? 'distilled' : 'raw';
- console.log(`\n--- Extracted Text (${bestSource}, first 40 lines) ---`);
- if (bestLines.length > 0) {
- console.log(bestLines.slice(0, 40).join('\n'));
- console.log(`\n(${bestLines.length} total non-empty lines)`);
- } else {
- console.log("(no text extracted)");
- }
- } else {
- console.log(` Ghostscript re-distill failed, using raw PDF`);
- writeFileSync(outputFile, pdf);
- }
- // Keep raw file for debugging
- // if (existsSync(rawFile) && existsSync(outputFile) && rawFile !== outputFile) {
- // unlinkSync(rawFile);
- // }
- }
- main().catch(console.error);