-------------------------
Cargo.toml
-------------------------
[package]
name = "stream_fixer"
version = "0.1.0"
edition = "2021"

[dependencies]
libz-sys = { version = "1", features = ["static"] }
rayon = "1"
serde = { version = "1", features = ["derive"] }
serde_json = "1"

[profile.release]
opt-level = 3
lto = true
-------------------------
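Build and run with cargo; the binary name matches [package].name above.
The file name and offsets in this invocation are illustrative only:

    cargo build --release
    ./target/release/stream_fixer scan.b64 1024 4096 --window 800

Progress goes to stderr; the final result is a single JSON object on stdout
(see SearchResult in main.rs).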
-------------------------
main.rs
-------------------------
//! Fast zlib stream fixer using Rust + Rayon parallelism.
//!
//! Takes a base64 file + stream byte offsets, and searches for b64 character
//! changes that make the zlib stream decompress. Uses raw inflate (ignoring
//! the checksum) so the caller can do Adler-32 repair afterward.
//!
//! Usage: stream_fixer <b64_file> <stream_start> <stream_end> [options]
//! Options:
//!   --max-iter N   Max greedy iterations (default 30)
//!   --max-nodes N  Max DFS nodes (default 100000)
//!   --window N     Search window in b64 chars (default 600)
//!   --branch N     DFS branching factor (default 3)
//!   --max-depth N  DFS max depth (default 20)
use libz_sys::*;
use rayon::prelude::*;
use serde::Serialize;
use std::mem::MaybeUninit;
use std::os::raw::c_int;
use std::{env, fs, process};
// ── Base64 ──────────────────────────────────────────────────────────
const B64: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/// Reverse lookup table: b64 character -> 6-bit value. Characters outside
/// the alphabet map to 0 ('A'); input is assumed to be cleaned ASCII base64.
fn b64_table() -> [u8; 128] {
    let mut t = [0u8; 128];
    for (i, &c) in B64.iter().enumerate() {
        t[c as usize] = i as u8;
    }
    t
}
/// Decode one 4-char base64 group into 3 bytes.
#[inline]
fn decode_group(b64: &[u8], pos: usize, t: &[u8; 128]) -> [u8; 3] {
    let a = t[b64[pos] as usize] as u32;
    let b = t[b64[pos + 1] as usize] as u32;
    let c = t[b64[pos + 2] as usize] as u32;
    let d = t[b64[pos + 3] as usize] as u32;
    [
        ((a << 2) | (b >> 4)) as u8,
        (((b & 0xF) << 4) | (c >> 2)) as u8,
        (((c & 3) << 6) | d) as u8,
    ]
}
/// Map a decoded-byte offset to the b64 char offset of its group.
#[inline]
fn pdf_to_b64(pdf_pos: usize) -> usize {
    (pdf_pos / 3) * 4
}
/// Decode the base64 covering decoded-byte range [start, end).
fn decode_range(b64: &[u8], start: usize, end: usize, t: &[u8; 128]) -> Vec<u8> {
    let b64_start = pdf_to_b64(start);
    let b64_end = (pdf_to_b64(end) + 4).min(b64.len());
    let mut decoded = Vec::with_capacity((b64_end - b64_start) / 4 * 3 + 3);
    let mut pos = b64_start;
    while pos + 3 < b64.len() && pos < b64_end {
        let g = decode_group(b64, pos, t);
        decoded.extend_from_slice(&g);
        pos += 4;
    }
    // `start` may not be group-aligned; skip the partial bytes at the front.
    let offset = start - (b64_start / 4) * 3;
    if offset >= decoded.len() {
        return Vec::new();
    }
    let len = end - start;
    let actual_len = len.min(decoded.len() - offset);
    decoded[offset..offset + actual_len].to_vec()
}
// ── Inflate helpers (raw libz via MaybeUninit for safety) ───────────
/// Bind `$strm` to a raw pointer to a zeroed, stack-allocated z_stream
/// (MaybeUninit avoids the mem::zeroed() panic on the fn-pointer fields).
/// Must be expanded inside `unsafe`; the caller must call inflateEnd.
macro_rules! zstream_init {
    ($strm:ident) => {
        let mut $strm = MaybeUninit::<z_stream>::uninit();
        std::ptr::write_bytes($strm.as_mut_ptr(), 0, 1);
        let $strm = $strm.as_mut_ptr();
    };
}
/// Test if data decompresses as a full zlib stream (header + checksum).
fn can_inflate(data: &[u8]) -> bool {
    unsafe {
        zstream_init!(strm);
        if inflateInit_(strm, zlibVersion(), std::mem::size_of::<z_stream>() as c_int) != Z_OK {
            return false;
        }
        (*strm).next_in = data.as_ptr() as *mut _;
        (*strm).avail_in = data.len() as u32;
        let mut out = [0u8; 65536];
        loop {
            (*strm).next_out = out.as_mut_ptr();
            (*strm).avail_out = out.len() as u32;
            let ret = inflate(strm, Z_FINISH);
            match ret {
                Z_STREAM_END => { inflateEnd(strm); return true; }
                Z_OK => continue,
                _ => { inflateEnd(strm); return false; }
            }
        }
    }
}
/// Test if the raw deflate body decompresses. Tries stripping 0/2/4 bytes
/// from the end (checksum area) to avoid false negatives from checksum corruption.
fn can_inflate_raw(data: &[u8]) -> bool {
    if data.len() < 6 { return false; }
    for trim in [4usize, 2, 0] {
        let end = data.len().saturating_sub(trim);
        if end <= 2 { continue; }
        if try_raw_inflate(&data[2..end]) { return true; }
    }
    false
}
/// Inflate raw deflate data (no zlib header, window bits -15).
fn try_raw_inflate(raw: &[u8]) -> bool {
    if raw.is_empty() { return false; }
    unsafe {
        zstream_init!(strm);
        if inflateInit2_(strm, -15, zlibVersion(), std::mem::size_of::<z_stream>() as c_int) != Z_OK {
            return false;
        }
        (*strm).next_in = raw.as_ptr() as *mut _;
        (*strm).avail_in = raw.len() as u32;
        let mut out = [0u8; 65536];
        loop {
            (*strm).next_out = out.as_mut_ptr();
            (*strm).avail_out = out.len() as u32;
            let ret = inflate(strm, Z_FINISH);
            match ret {
                Z_STREAM_END => { inflateEnd(strm); return true; }
                Z_OK => continue,
                _ => { inflateEnd(strm); return false; }
            }
        }
    }
}
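// A quick check of the inflate wrappers against the canonical zlib encoding
// of the empty string (78 9C | 03 00 | Adler-32 = 1); a sketch added here,
// not part of the original paste.
#[cfg(test)]
mod inflate_tests {
    use super::*;

    const EMPTY_ZLIB: [u8; 8] = [0x78, 0x9C, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01];

    #[test]
    fn accepts_valid_stream_and_rejects_reserved_btype() {
        assert!(can_inflate(&EMPTY_ZLIB));
        assert!(can_inflate_raw(&EMPTY_ZLIB));
        // 0xFF as the first deflate byte sets BTYPE = 11 (reserved),
        // an immediate data error.
        let mut bad = EMPTY_ZLIB;
        bad[2] = 0xFF;
        assert!(!can_inflate(&bad));
        assert!(!can_inflate_raw(&bad));
    }
}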
/// Check if the error is in the checksum area (last 6 bytes). If so, the data
/// is likely OK and just needs Adler-32 repair. (Kept for callers; the search
/// loops inline the same check.)
#[allow(dead_code)]
fn is_checksum_error(data: &[u8]) -> bool {
    let err = find_corruption_offset(data);
    err + 6 >= data.len()
}
/// Test if a prefix of raw deflate data inflates without a data error.
/// Z_BUF_ERROR (inflate wants more input) counts as success: the prefix
/// is valid so far.
fn try_raw_prefix(raw: &[u8], len: usize) -> bool {
    unsafe {
        zstream_init!(strm);
        if inflateInit2_(strm, -15, zlibVersion(), std::mem::size_of::<z_stream>() as c_int) != Z_OK {
            return false;
        }
        (*strm).next_in = raw.as_ptr() as *mut _;
        (*strm).avail_in = len as u32;
        let mut out = [0u8; 65536];
        loop {
            (*strm).next_out = out.as_mut_ptr();
            (*strm).avail_out = out.len() as u32;
            let ret = inflate(strm, Z_SYNC_FLUSH);
            match ret {
                Z_OK => {
                    if (*strm).avail_in == 0 { inflateEnd(strm); return true; }
                }
                Z_STREAM_END => { inflateEnd(strm); return true; }
                Z_BUF_ERROR => { inflateEnd(strm); return true; }
                _ => { inflateEnd(strm); return false; }
            }
        }
    }
}
/// Binary search for the corruption offset in a zlib stream: the longest
/// prefix that still inflates cleanly. Assumes prefix validity is monotone,
/// which holds for deflate in practice. Returns an offset into `data`
/// (the +2 skips the zlib header).
fn find_corruption_offset(data: &[u8]) -> usize {
    if data.len() < 3 { return 0; }
    let raw = &data[2..];
    let mut lo = 0usize;
    let mut hi = raw.len();
    while lo + 1 < hi {
        let mid = (lo + hi) / 2;
        if try_raw_prefix(raw, mid) {
            lo = mid;
        } else {
            hi = mid;
        }
    }
    lo + 2
}
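// Pinpointing a corrupted NLEN in a hand-assembled stored-block stream:
// 78 01 | 01 (BFINAL=1, stored) | LEN=4 | NLEN=!LEN | "AAAA" | Adler-32.
// A sketch added for illustration; not part of the original paste.
#[cfg(test)]
mod offset_tests {
    use super::*;

    #[test]
    fn finds_offset_of_bad_stored_block_length() {
        let good: Vec<u8> = vec![
            0x78, 0x01, 0x01, 0x04, 0x00, 0xFB, 0xFF,
            b'A', b'A', b'A', b'A', 0x02, 0x8E, 0x01, 0x05,
        ];
        // A clean stream "fails" only in the checksum area, if at all.
        assert!(find_corruption_offset(&good) + 6 >= good.len());
        // Corrupt NLEN: inflate errors once all 4 length bytes are read,
        // so the longest good raw prefix is 4 bytes -> offset 4 + 2 = 6.
        let mut bad = good.clone();
        bad[5] = 0x00;
        assert_eq!(find_corruption_offset(&bad), 6);
    }
}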
/// Check if a change lets inflate get past the known error position.
fn passes_err_pos(data: &[u8], err_pos: usize) -> bool {
    if err_pos < 3 || data.len() < 3 { return false; }
    let test_len = (data.len() - 2).min(err_pos + 3);
    try_raw_prefix(&data[2..], test_len)
}
// ── Confusion pairs ─────────────────────────────────────────────────
/// OCR-style lookalike substitutions, tried first during DFS.
fn confusion_subs(ch: u8) -> &'static [u8] {
    match ch {
        b'l' => &[b'1', b'I'],
        b'1' => &[b'l', b'I'],
        b'I' => &[b'l', b'1'],
        b'O' => &[b'0'],
        b'0' => &[b'O'],
        b'5' => &[b'S'],
        b'S' => &[b'5'],
        b'8' => &[b'B'],
        b'B' => &[b'8'],
        _ => &[],
    }
}
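// Repair must be able to invert a misread (if 'l' was read as '1', the subs
// of '1' must lead back to 'l'), so the table has to be symmetric and stay
// inside the b64 alphabet. A small invariant test added here as a sketch.
#[cfg(test)]
mod confusion_tests {
    use super::*;

    #[test]
    fn substitutions_are_symmetric_and_in_alphabet() {
        for &ch in B64.iter() {
            for &sub in confusion_subs(ch) {
                assert!(B64.contains(&sub));
                assert!(confusion_subs(sub).contains(&ch));
            }
        }
    }
}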
// ── Candidate generation ────────────────────────────────────────────
#[derive(Clone)]
struct Change {
    b64_pos: usize,
    new_char: u8,
    /// (stream_data_offset, new_byte_value)
    byte_changes: Vec<(usize, u8)>,
}
/// For each b64 position in [b64_lo, b64_hi), try substitute characters and
/// record which decoded stream bytes each substitution would change.
fn generate_candidates(
    b64: &[u8], t: &[u8; 128],
    stream_start: usize, stream_end: usize,
    b64_lo: usize, b64_hi: usize,
    confusion_only: bool,
) -> Vec<Change> {
    let mut candidates = Vec::new();
    for b64_pos in b64_lo..b64_hi {
        if b64_pos >= b64.len() { break; }
        let orig = b64[b64_pos];
        let group_start = (b64_pos / 4) * 4;
        if group_start + 3 >= b64.len() { continue; }
        let pdf_start = (group_start / 4) * 3;
        let old_bytes = decode_group(b64, group_start, t);
        let subs = confusion_subs(orig);
        let chars_to_try: Vec<u8> = if confusion_only {
            subs.to_vec()
        } else {
            let mut v = subs.to_vec();
            for &c in B64 {
                if c != orig && !v.contains(&c) {
                    v.push(c);
                }
            }
            v
        };
        for new_char in chars_to_try {
            let idx_in_group = b64_pos - group_start;
            let mut group = [b64[group_start], b64[group_start+1], b64[group_start+2], b64[group_start+3]];
            group[idx_in_group] = new_char;
            let new_bytes = [
                ((t[group[0] as usize] as u32) << 2 | (t[group[1] as usize] as u32) >> 4) as u8,
                (((t[group[1] as usize] as u32) & 0xF) << 4 | (t[group[2] as usize] as u32) >> 2) as u8,
                (((t[group[2] as usize] as u32) & 3) << 6 | t[group[3] as usize] as u32) as u8,
            ];
            let mut byte_changes = Vec::new();
            for k in 0..3 {
                let off = pdf_start + k;
                if off >= stream_start && off < stream_end && old_bytes[k] != new_bytes[k] {
                    byte_changes.push((off - stream_start, new_bytes[k]));
                }
            }
            if byte_changes.is_empty() { continue; }
            candidates.push(Change { b64_pos, new_char, byte_changes });
        }
    }
    candidates
}
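// One concrete candidate, checked by hand: in "TWFuTWFu" ("ManMan"),
// changing char 0 'T' -> 'U' turns decoded byte 0 from 77 ('M') into
// (20 << 2) | 1 = 81 ('Q'). A sketch added for illustration.
#[cfg(test)]
mod candidate_tests {
    use super::*;

    #[test]
    fn substitution_maps_to_expected_byte_change() {
        let t = b64_table();
        let cands = generate_candidates(b"TWFuTWFu", &t, 0, 6, 0, 8, false);
        assert!(cands.iter().any(|c| {
            c.b64_pos == 0 && c.new_char == b'U' && c.byte_changes == vec![(0, 81)]
        }));
    }
}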
// ── Parallel evaluation ─────────────────────────────────────────────
/// Evaluate candidates in parallel. Returns (index, new_error_pos) pairs
/// for candidates that move the error forward; new_error_pos == usize::MAX
/// means the stream is fully fixed.
fn eval_candidates(
    stream_data: &[u8],
    candidates: &[Change],
    err_pos: usize,
) -> Vec<(usize, usize)> {
    candidates.par_iter().enumerate().filter_map(|(idx, cand)| {
        let mut data = stream_data.to_vec();
        for &(off, val) in &cand.byte_changes {
            if off < data.len() { data[off] = val; }
        }
        if can_inflate_raw(&data) {
            return Some((idx, usize::MAX));
        }
        if passes_err_pos(&data, err_pos) {
            let new_err = find_corruption_offset(&data);
            if new_err > err_pos {
                return Some((idx, new_err));
            }
        }
        None
    }).collect()
}
// ── Iterative greedy search ─────────────────────────────────────────
/// Greedy repair: at each iteration, apply the single substitution that
/// pushes the corruption offset furthest forward.
fn iterative_search(
    b64: &mut Vec<u8>, t: &[u8; 128],
    stream_start: usize, stream_end: usize,
    max_iter: usize, window: usize,
) -> SearchResult {
    let mut stream_data = decode_range(b64, stream_start, stream_end, t);
    let mut changes: Vec<(usize, u8, u8)> = Vec::new();
    for iter in 0..max_iter {
        if can_inflate_raw(&stream_data) {
            return SearchResult::success(&changes, format!("{} fixes in {} iter", changes.len(), iter));
        }
        let err_pos = find_corruption_offset(&stream_data);
        // Error in checksum area = raw data is fixed; caller does Adler-32 repair
        if err_pos + 6 >= stream_data.len() {
            eprintln!();
            return SearchResult::success(&changes,
                format!("{} fixes + checksum needed", changes.len()));
        }
        let b64_center = pdf_to_b64(stream_start + err_pos);
        let b64_lo = pdf_to_b64(stream_start).max(b64_center.saturating_sub(window));
        let b64_hi = (pdf_to_b64(stream_end) + 4).min(b64_center + 20).min(b64.len());
        let candidates = generate_candidates(b64, t, stream_start, stream_end, b64_lo, b64_hi, false);
        if candidates.is_empty() { break; }
        let results = eval_candidates(&stream_data, &candidates, err_pos);
        if results.is_empty() { break; }
        let &(best_idx, best_err) = results.iter().max_by_key(|r| r.1).unwrap();
        let cand = &candidates[best_idx];
        let old_char = b64[cand.b64_pos];
        b64[cand.b64_pos] = cand.new_char;
        for &(off, val) in &cand.byte_changes {
            stream_data[off] = val;
        }
        changes.push((cand.b64_pos, old_char, cand.new_char));
        eprint!("\r iter {}: err {} -> {}, b64[{}] '{}' -> '{}'",
            iter, err_pos,
            if best_err == usize::MAX { "OK".to_string() } else { best_err.to_string() },
            cand.b64_pos, old_char as char, cand.new_char as char);
        if best_err == usize::MAX {
            eprintln!();
            return SearchResult::success(&changes, format!("{} fixes in {} iter", changes.len(), iter + 1));
        }
    }
    // Final check
    if can_inflate_raw(&stream_data) {
        eprintln!();
        return SearchResult::success(&changes, format!("{} fixes", changes.len()));
    }
    // No luck: revert all applied changes
    for &(pos, old, _) in changes.iter().rev() {
        b64[pos] = old;
    }
    eprintln!();
    SearchResult::fail(format!("iterative: {} partial reverted", changes.len()))
}
// ── DFS with backtracking ───────────────────────────────────────────
struct DfsCtx<'a> {
    b64: &'a mut Vec<u8>,
    t: &'a [u8; 128],
    stream_data: Vec<u8>,
    stream_start: usize,
    stream_end: usize,
    applied: Vec<(usize, u8, u8, Vec<(usize, u8)>)>, // (b64_pos, old_char, new_char, old_bytes)
    node_count: usize,
    max_nodes: usize,
    max_depth: usize,
    window: usize,
    branch: usize,
}
impl<'a> DfsCtx<'a> {
    /// Depth-first search over substitutions, trying the `branch` best
    /// candidates at each node and backtracking on failure.
    fn search(&mut self, depth: usize) -> bool {
        if can_inflate_raw(&self.stream_data) { return true; }
        if depth >= self.max_depth || self.node_count >= self.max_nodes { return false; }
        self.node_count += 1;
        let err_pos = find_corruption_offset(&self.stream_data);
        // Error in checksum area = raw data is OK
        if err_pos + 6 >= self.stream_data.len() { return true; }
        let b64_center = pdf_to_b64(self.stream_start + err_pos);
        // Pass 1: confusion pairs in a wide window
        let b64_lo = pdf_to_b64(self.stream_start).max(b64_center.saturating_sub(self.window));
        let b64_hi = (pdf_to_b64(self.stream_end) + 4).min(b64_center + 20).min(self.b64.len());
        let mut candidates = generate_candidates(self.b64, self.t, self.stream_start, self.stream_end, b64_lo, b64_hi, true);
        let mut results = eval_candidates(&self.stream_data, &candidates, err_pos);
        // Pass 2: all chars in a tight window when the confusion pass yields
        // nothing usable. (Gating on eval results, not candidate existence:
        // unhelpful confusion pairs must not block this fallback.)
        if results.is_empty() {
            let tight_lo = pdf_to_b64(self.stream_start).max(b64_center.saturating_sub(60));
            let tight_hi = (pdf_to_b64(self.stream_end) + 4).min(b64_center + 10).min(self.b64.len());
            candidates = generate_candidates(self.b64, self.t, self.stream_start, self.stream_end, tight_lo, tight_hi, false);
            results = eval_candidates(&self.stream_data, &candidates, err_pos);
        }
        if results.is_empty() { return false; }
        let mut sorted: Vec<(usize, usize)> = results;
        sorted.sort_by(|a, b| b.1.cmp(&a.1));
        for &(idx, improvement) in sorted.iter().take(self.branch) {
            if self.node_count >= self.max_nodes { break; }
            let cand = candidates[idx].clone();
            let old_char = self.b64[cand.b64_pos];
            // Save old bytes for revert
            let old_bytes: Vec<(usize, u8)> = cand.byte_changes.iter()
                .map(|&(off, _)| (off, self.stream_data[off]))
                .collect();
            // Apply
            self.b64[cand.b64_pos] = cand.new_char;
            for &(off, val) in &cand.byte_changes {
                self.stream_data[off] = val;
            }
            self.applied.push((cand.b64_pos, old_char, cand.new_char, old_bytes.clone()));
            if improvement == usize::MAX || self.search(depth + 1) {
                return true;
            }
            // Revert
            let (pos, old, _, bytes) = self.applied.pop().unwrap();
            self.b64[pos] = old;
            for (off, val) in bytes {
                self.stream_data[off] = val;
            }
        }
        false
    }
}
fn dfs_search(
    b64: &mut Vec<u8>, t: &[u8; 128],
    stream_start: usize, stream_end: usize,
    max_nodes: usize, max_depth: usize, window: usize, branch: usize,
) -> SearchResult {
    let stream_data = decode_range(b64, stream_start, stream_end, t);
    if can_inflate_raw(&stream_data) {
        return SearchResult::success(&[], "already OK".into());
    }
    let mut ctx = DfsCtx {
        b64, t, stream_data, stream_start, stream_end,
        applied: Vec::new(),
        node_count: 0, max_nodes, max_depth, window, branch,
    };
    if ctx.search(0) {
        let changes: Vec<(usize, u8, u8)> = ctx.applied.iter()
            .map(|(pos, old, new, _)| (*pos, *old, *new))
            .collect();
        eprintln!(" DFS: {} fixes, {} nodes explored", changes.len(), ctx.node_count);
        SearchResult::success(&changes, format!("DFS {} fixes ({} nodes)", changes.len(), ctx.node_count))
    } else {
        // Revert (should already be reverted by backtracking, but just in case)
        for (pos, old, _, bytes) in ctx.applied.iter().rev() {
            ctx.b64[*pos] = *old;
            for &(off, val) in bytes {
                ctx.stream_data[off] = val;
            }
        }
        eprintln!(" DFS: exhausted ({} nodes)", ctx.node_count);
        SearchResult::fail(format!("DFS exhausted ({} nodes)", ctx.node_count))
    }
}
// ── Output ──────────────────────────────────────────────────────────
#[derive(Serialize)]
struct ChangeOutput {
    b64_pos: usize,
    old_char: String,
    new_char: String,
}
#[derive(Serialize)]
struct SearchResult {
    fixed: bool,
    changes: Vec<ChangeOutput>,
    desc: String,
}
impl SearchResult {
    fn success(changes: &[(usize, u8, u8)], desc: String) -> Self {
        SearchResult {
            fixed: true,
            changes: changes.iter().map(|&(pos, old, new)| ChangeOutput {
                b64_pos: pos,
                old_char: String::from(old as char),
                new_char: String::from(new as char),
            }).collect(),
            desc,
        }
    }
    fn fail(desc: String) -> Self {
        SearchResult { fixed: false, changes: vec![], desc }
    }
}
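// Pinning the JSON shape the caller consumes on stdout; serde_json emits
// fields in struct-definition order. A small test added as a sketch.
#[cfg(test)]
mod output_tests {
    use super::*;

    #[test]
    fn result_serializes_compactly() {
        let r = SearchResult::success(&[(7, b'l', b'1')], "demo".into());
        assert_eq!(
            serde_json::to_string(&r).unwrap(),
            r#"{"fixed":true,"changes":[{"b64_pos":7,"old_char":"l","new_char":"1"}],"desc":"demo"}"#
        );
    }
}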
// ── Main ────────────────────────────────────────────────────────────
fn main() {
    // Rayon threads need enough stack for zlib's internal state + output buffers
    rayon::ThreadPoolBuilder::new()
        .stack_size(4 * 1024 * 1024) // 4 MB per thread
        .build_global()
        .ok();
    let args: Vec<String> = env::args().collect();
    if args.len() < 4 {
        eprintln!("Usage: stream_fixer <b64_file> <stream_start> <stream_end> [options]");
        eprintln!("Options: --max-iter N --max-nodes N --window N --branch N --max-depth N");
        process::exit(1);
    }
    let b64_file = &args[1];
    let stream_start: usize = args[2].parse().expect("invalid stream_start");
    let stream_end: usize = args[3].parse().expect("invalid stream_end");
    // Parse options (args.get guards against a flag with a missing value)
    let mut max_iter = 30usize;
    let mut max_nodes = 100_000usize;
    let mut window = 600usize;
    let mut branch = 3usize;
    let mut max_depth = 20usize;
    let mut i = 4;
    while i < args.len() {
        match args[i].as_str() {
            "--max-iter" => { i += 1; max_iter = args.get(i).expect("--max-iter needs a value").parse().expect("--max-iter: not a number"); }
            "--max-nodes" => { i += 1; max_nodes = args.get(i).expect("--max-nodes needs a value").parse().expect("--max-nodes: not a number"); }
            "--window" => { i += 1; window = args.get(i).expect("--window needs a value").parse().expect("--window: not a number"); }
            "--branch" => { i += 1; branch = args.get(i).expect("--branch needs a value").parse().expect("--branch: not a number"); }
            "--max-depth" => { i += 1; max_depth = args.get(i).expect("--max-depth needs a value").parse().expect("--max-depth: not a number"); }
            _ => { eprintln!("Unknown option: {}", args[i]); process::exit(1); }
        }
        i += 1;
    }
    eprintln!("stream_fixer: stream [{}, {}), {}b, window={}, max_iter={}, max_nodes={}, branch={}, depth={}",
        stream_start, stream_end, stream_end - stream_start,
        window, max_iter, max_nodes, branch, max_depth);
    // Read base64 (assumed ASCII; only whitespace is stripped)
    let raw = fs::read_to_string(b64_file).expect("cannot read b64 file");
    let clean: String = raw.chars().filter(|c| !c.is_whitespace()).collect();
    let mut b64 = clean.into_bytes();
    let t = b64_table();
    // Check current state
    let stream_data = decode_range(&b64, stream_start, stream_end, &t);
    let err_pos = find_corruption_offset(&stream_data);
    let raw_ok = can_inflate_raw(&stream_data);
    let full_ok = can_inflate(&stream_data);
    eprintln!(" initial: raw_ok={}, full_ok={}, err_pos={}/{}", raw_ok, full_ok, err_pos, stream_data.len());
    if full_ok {
        let result = SearchResult::success(&[], "already OK".into());
        println!("{}", serde_json::to_string(&result).unwrap());
        return;
    }
    if raw_ok {
        let result = SearchResult::success(&[], "raw OK, needs checksum only".into());
        println!("{}", serde_json::to_string(&result).unwrap());
        return;
    }
    // Phase 1: Iterative greedy
    eprintln!(" Phase 1: iterative greedy...");
    let result = iterative_search(&mut b64, &t, stream_start, stream_end, max_iter, window);
    if result.fixed {
        println!("{}", serde_json::to_string(&result).unwrap());
        return;
    }
    // Phase 2: DFS with backtracking
    eprintln!(" Phase 2: DFS backtracking...");
    let result = dfs_search(&mut b64, &t, stream_start, stream_end, max_nodes, max_depth, window, branch);
    println!("{}", serde_json::to_string(&result).unwrap());
}
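// End-to-end check of both search phases. We build a real zlib stream with
// libz's compress(), base64-encode it, then flip b64 char 3: its 6 bits are
// the low bits of decoded byte 2 (the first deflate byte), so value 0b000110
// forces BTYPE = 11 (reserved), a guaranteed hard error near offset 0.
// A sketch appended for illustration; payload and parameters are arbitrary.
#[cfg(test)]
mod search_tests {
    use super::*;

    fn zlib_compress(data: &[u8]) -> Vec<u8> {
        unsafe {
            let mut dest_len = (data.len() + data.len() / 1000 + 64) as uLong;
            let mut dest = vec![0u8; dest_len as usize];
            let rc = compress(dest.as_mut_ptr(), &mut dest_len, data.as_ptr(), data.len() as uLong);
            assert_eq!(rc, Z_OK);
            dest.truncate(dest_len as usize);
            dest
        }
    }

    /// Unpadded encoder; trailing zero bytes stand in for '=' padding,
    /// which is fine because decode_range only reads [start, end).
    fn encode_b64(data: &[u8]) -> Vec<u8> {
        let mut padded = data.to_vec();
        while padded.len() % 3 != 0 { padded.push(0); }
        let mut out = Vec::with_capacity(padded.len() / 3 * 4);
        for chunk in padded.chunks(3) {
            let n = ((chunk[0] as u32) << 16) | ((chunk[1] as u32) << 8) | chunk[2] as u32;
            for shift in [18, 12, 6, 0] {
                out.push(B64[(n >> shift) as usize & 63]);
            }
        }
        out
    }

    /// Both 6 and 62 have bits 1..2 set, so BTYPE becomes reserved either way.
    fn corrupt(b64: &mut [u8]) {
        b64[3] = if b64[3] == B64[6] { B64[62] } else { B64[6] };
    }

    #[test]
    fn greedy_phase_repairs_single_bad_char() {
        let payload: Vec<u8> = (0u32..2000).map(|i| (i % 251) as u8).collect();
        let compressed = zlib_compress(&payload);
        let t = b64_table();
        let mut b64 = encode_b64(&compressed);
        corrupt(&mut b64);
        assert!(!can_inflate_raw(&decode_range(&b64, 0, compressed.len(), &t)));
        let result = iterative_search(&mut b64, &t, 0, compressed.len(), 30, 600);
        assert!(result.fixed);
    }

    #[test]
    fn dfs_phase_repairs_single_bad_char() {
        let payload: Vec<u8> = (0u32..2000).map(|i| (i % 251) as u8).collect();
        let compressed = zlib_compress(&payload);
        let t = b64_table();
        let mut b64 = encode_b64(&compressed);
        corrupt(&mut b64);
        let result = dfs_search(&mut b64, &t, 0, compressed.len(), 1_000, 20, 600, 3);
        assert!(result.fixed);
    }
}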