-------------------------
Cargo.toml
-------------------------
[package]
name = "stream_fixer"
version = "0.1.0"
edition = "2021"

[dependencies]
libz-sys = { version = "1", features = ["static"] }
rayon = "1"
serde = { version = "1", features = ["derive"] }
serde_json = "1"

[profile.release]
opt-level = 3
lto = true
-------------------------
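Build and run with cargo; the binary name matches [package].name above.
The file name and offsets in this invocation are illustrative only:

    cargo build --release
    ./target/release/stream_fixer scan.b64 1024 4096 --window 800

Progress goes to stderr; the final result is a single JSON object on stdout
(see SearchResult in main.rs).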
-------------------------
main.rs
-------------------------
//! Fast zlib stream fixer using Rust + Rayon parallelism.
//!
//! Takes a base64 file + stream byte offsets, and searches for b64 character
//! changes that make the zlib stream decompress. Uses raw inflate (ignoring
//! the checksum) so the caller can do Adler-32 repair afterward.
//!
//! Usage: stream_fixer <b64_file> <stream_start> <stream_end> [options]
//! Options:
//!   --max-iter N   Max greedy iterations (default 30)
//!   --max-nodes N  Max DFS nodes (default 100000)
//!   --window N     Search window in b64 chars (default 600)
//!   --branch N     DFS branching factor (default 3)
//!   --max-depth N  DFS max depth (default 20)
use libz_sys::*;
use rayon::prelude::*;
use serde::Serialize;
use std::mem::MaybeUninit;
use std::os::raw::c_int;
use std::{env, fs, process};
// ── Base64 ──────────────────────────────────────────────────────────
const B64: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/// Reverse lookup table: b64 character -> 6-bit value. Characters outside
/// the alphabet map to 0 ('A'); input is assumed to be cleaned ASCII base64.
fn b64_table() -> [u8; 128] {
    let mut t = [0u8; 128];
    for (i, &c) in B64.iter().enumerate() {
        t[c as usize] = i as u8;
    }
    t
}
/// Decode one 4-char base64 group into 3 bytes.
#[inline]
fn decode_group(b64: &[u8], pos: usize, t: &[u8; 128]) -> [u8; 3] {
    let a = t[b64[pos] as usize] as u32;
    let b = t[b64[pos + 1] as usize] as u32;
    let c = t[b64[pos + 2] as usize] as u32;
    let d = t[b64[pos + 3] as usize] as u32;
    [
        ((a << 2) | (b >> 4)) as u8,
        (((b & 0xF) << 4) | (c >> 2)) as u8,
        (((c & 3) << 6) | d) as u8,
    ]
}
/// Map a decoded-byte offset to the b64 char offset of its group.
#[inline]
fn pdf_to_b64(pdf_pos: usize) -> usize {
    (pdf_pos / 3) * 4
}
/// Decode the base64 covering decoded-byte range [start, end).
fn decode_range(b64: &[u8], start: usize, end: usize, t: &[u8; 128]) -> Vec<u8> {
    let b64_start = pdf_to_b64(start);
    let b64_end = (pdf_to_b64(end) + 4).min(b64.len());
    let mut decoded = Vec::with_capacity((b64_end - b64_start) / 4 * 3 + 3);
    let mut pos = b64_start;
    while pos + 3 < b64.len() && pos < b64_end {
        let g = decode_group(b64, pos, t);
        decoded.extend_from_slice(&g);
        pos += 4;
    }
    // `start` may not be group-aligned; skip the partial bytes at the front.
    let offset = start - (b64_start / 4) * 3;
    if offset >= decoded.len() {
        return Vec::new();
    }
    let len = end - start;
    let actual_len = len.min(decoded.len() - offset);
    decoded[offset..offset + actual_len].to_vec()
}
// ── Inflate helpers (raw libz via MaybeUninit for safety) ───────────
/// Bind `$strm` to a raw pointer to a zeroed, stack-allocated z_stream
/// (MaybeUninit avoids the mem::zeroed() panic on the fn-pointer fields).
/// Must be expanded inside `unsafe`; the caller must call inflateEnd.
macro_rules! zstream_init {
    ($strm:ident) => {
        let mut $strm = MaybeUninit::<z_stream>::uninit();
        std::ptr::write_bytes($strm.as_mut_ptr(), 0, 1);
        let $strm = $strm.as_mut_ptr();
    };
}
/// Test if data decompresses as a full zlib stream (header + checksum).
fn can_inflate(data: &[u8]) -> bool {
    unsafe {
        zstream_init!(strm);
        if inflateInit_(strm, zlibVersion(), std::mem::size_of::<z_stream>() as c_int) != Z_OK {
            return false;
        }
        (*strm).next_in = data.as_ptr() as *mut _;
        (*strm).avail_in = data.len() as u32;
        let mut out = [0u8; 65536];
        loop {
            (*strm).next_out = out.as_mut_ptr();
            (*strm).avail_out = out.len() as u32;
            let ret = inflate(strm, Z_FINISH);
            match ret {
                Z_STREAM_END => { inflateEnd(strm); return true; }
                Z_OK => continue,
                _ => { inflateEnd(strm); return false; }
            }
        }
    }
}
/// Test if the raw deflate body decompresses. Tries stripping 0/2/4 bytes
/// from the end (checksum area) to avoid false negatives from checksum corruption.
fn can_inflate_raw(data: &[u8]) -> bool {
    if data.len() < 6 { return false; }
    for trim in [4usize, 2, 0] {
        let end = data.len().saturating_sub(trim);
        if end <= 2 { continue; }
        if try_raw_inflate(&data[2..end]) { return true; }
    }
    false
}
/// Inflate raw deflate data (no zlib header, window bits -15).
fn try_raw_inflate(raw: &[u8]) -> bool {
    if raw.is_empty() { return false; }
    unsafe {
        zstream_init!(strm);
        if inflateInit2_(strm, -15, zlibVersion(), std::mem::size_of::<z_stream>() as c_int) != Z_OK {
            return false;
        }
        (*strm).next_in = raw.as_ptr() as *mut _;
        (*strm).avail_in = raw.len() as u32;
        let mut out = [0u8; 65536];
        loop {
            (*strm).next_out = out.as_mut_ptr();
            (*strm).avail_out = out.len() as u32;
            let ret = inflate(strm, Z_FINISH);
            match ret {
                Z_STREAM_END => { inflateEnd(strm); return true; }
                Z_OK => continue,
                _ => { inflateEnd(strm); return false; }
            }
        }
    }
}
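// A quick check of the inflate wrappers against the canonical zlib encoding
// of the empty string (78 9C | 03 00 | Adler-32 = 1); a sketch added here,
// not part of the original paste.
#[cfg(test)]
mod inflate_tests {
    use super::*;

    const EMPTY_ZLIB: [u8; 8] = [0x78, 0x9C, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01];

    #[test]
    fn accepts_valid_stream_and_rejects_reserved_btype() {
        assert!(can_inflate(&EMPTY_ZLIB));
        assert!(can_inflate_raw(&EMPTY_ZLIB));
        // 0xFF as the first deflate byte sets BTYPE = 11 (reserved),
        // an immediate data error.
        let mut bad = EMPTY_ZLIB;
        bad[2] = 0xFF;
        assert!(!can_inflate(&bad));
        assert!(!can_inflate_raw(&bad));
    }
}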
/// Check if the error is in the checksum area (last 6 bytes). If so, the data
/// is likely OK and just needs Adler-32 repair. (Kept for callers; the search
/// loops inline the same check.)
#[allow(dead_code)]
fn is_checksum_error(data: &[u8]) -> bool {
    let err = find_corruption_offset(data);
    err + 6 >= data.len()
}
/// Test if a prefix of raw deflate data inflates without a data error.
/// Z_BUF_ERROR (inflate wants more input) counts as success: the prefix
/// is valid so far.
fn try_raw_prefix(raw: &[u8], len: usize) -> bool {
    unsafe {
        zstream_init!(strm);
        if inflateInit2_(strm, -15, zlibVersion(), std::mem::size_of::<z_stream>() as c_int) != Z_OK {
            return false;
        }
        (*strm).next_in = raw.as_ptr() as *mut _;
        (*strm).avail_in = len as u32;
        let mut out = [0u8; 65536];
        loop {
            (*strm).next_out = out.as_mut_ptr();
            (*strm).avail_out = out.len() as u32;
            let ret = inflate(strm, Z_SYNC_FLUSH);
            match ret {
                Z_OK => {
                    if (*strm).avail_in == 0 { inflateEnd(strm); return true; }
                }
                Z_STREAM_END => { inflateEnd(strm); return true; }
                Z_BUF_ERROR => { inflateEnd(strm); return true; }
                _ => { inflateEnd(strm); return false; }
            }
        }
    }
}
/// Binary search for the corruption offset in a zlib stream: the longest
/// prefix that still inflates cleanly. Assumes prefix validity is monotone,
/// which holds for deflate in practice. Returns an offset into `data`
/// (the +2 skips the zlib header).
fn find_corruption_offset(data: &[u8]) -> usize {
    if data.len() < 3 { return 0; }
    let raw = &data[2..];
    let mut lo = 0usize;
    let mut hi = raw.len();
    while lo + 1 < hi {
        let mid = (lo + hi) / 2;
        if try_raw_prefix(raw, mid) {
            lo = mid;
        } else {
            hi = mid;
        }
    }
    lo + 2
}
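// Pinpointing a corrupted NLEN in a hand-assembled stored-block stream:
// 78 01 | 01 (BFINAL=1, stored) | LEN=4 | NLEN=!LEN | "AAAA" | Adler-32.
// A sketch added for illustration; not part of the original paste.
#[cfg(test)]
mod offset_tests {
    use super::*;

    #[test]
    fn finds_offset_of_bad_stored_block_length() {
        let good: Vec<u8> = vec![
            0x78, 0x01, 0x01, 0x04, 0x00, 0xFB, 0xFF,
            b'A', b'A', b'A', b'A', 0x02, 0x8E, 0x01, 0x05,
        ];
        // A clean stream "fails" only in the checksum area, if at all.
        assert!(find_corruption_offset(&good) + 6 >= good.len());
        // Corrupt NLEN: inflate errors once all 4 length bytes are read,
        // so the longest good raw prefix is 4 bytes -> offset 4 + 2 = 6.
        let mut bad = good.clone();
        bad[5] = 0x00;
        assert_eq!(find_corruption_offset(&bad), 6);
    }
}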
/// Check if a change lets inflate get past the known error position.
fn passes_err_pos(data: &[u8], err_pos: usize) -> bool {
    if err_pos < 3 || data.len() < 3 { return false; }
    let test_len = (data.len() - 2).min(err_pos + 3);
    try_raw_prefix(&data[2..], test_len)
}
// ── Confusion pairs ─────────────────────────────────────────────────
/// OCR-style lookalike substitutions, tried first during DFS.
fn confusion_subs(ch: u8) -> &'static [u8] {
    match ch {
        b'l' => &[b'1', b'I'],
        b'1' => &[b'l', b'I'],
        b'I' => &[b'l', b'1'],
        b'O' => &[b'0'],
        b'0' => &[b'O'],
        b'5' => &[b'S'],
        b'S' => &[b'5'],
        b'8' => &[b'B'],
        b'B' => &[b'8'],
        _ => &[],
    }
}
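// Repair must be able to invert a misread (if 'l' was read as '1', the subs
// of '1' must lead back to 'l'), so the table has to be symmetric and stay
// inside the b64 alphabet. A small invariant test added here as a sketch.
#[cfg(test)]
mod confusion_tests {
    use super::*;

    #[test]
    fn substitutions_are_symmetric_and_in_alphabet() {
        for &ch in B64.iter() {
            for &sub in confusion_subs(ch) {
                assert!(B64.contains(&sub));
                assert!(confusion_subs(sub).contains(&ch));
            }
        }
    }
}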
// ── Candidate generation ────────────────────────────────────────────
#[derive(Clone)]
struct Change {
    b64_pos: usize,
    new_char: u8,
    /// (stream_data_offset, new_byte_value)
    byte_changes: Vec<(usize, u8)>,
}
/// For each b64 position in [b64_lo, b64_hi), try substitute characters and
/// record which decoded stream bytes each substitution would change.
fn generate_candidates(
    b64: &[u8], t: &[u8; 128],
    stream_start: usize, stream_end: usize,
    b64_lo: usize, b64_hi: usize,
    confusion_only: bool,
) -> Vec<Change> {
    let mut candidates = Vec::new();
    for b64_pos in b64_lo..b64_hi {
        if b64_pos >= b64.len() { break; }
        let orig = b64[b64_pos];
        let group_start = (b64_pos / 4) * 4;
        if group_start + 3 >= b64.len() { continue; }
        let pdf_start = (group_start / 4) * 3;
        let old_bytes = decode_group(b64, group_start, t);
        let subs = confusion_subs(orig);
        let chars_to_try: Vec<u8> = if confusion_only {
            subs.to_vec()
        } else {
            let mut v = subs.to_vec();
            for &c in B64 {
                if c != orig && !v.contains(&c) {
                    v.push(c);
                }
            }
            v
        };
        for new_char in chars_to_try {
            let idx_in_group = b64_pos - group_start;
            let mut group = [b64[group_start], b64[group_start+1], b64[group_start+2], b64[group_start+3]];
            group[idx_in_group] = new_char;
            let new_bytes = [
                ((t[group[0] as usize] as u32) << 2 | (t[group[1] as usize] as u32) >> 4) as u8,
                (((t[group[1] as usize] as u32) & 0xF) << 4 | (t[group[2] as usize] as u32) >> 2) as u8,
                (((t[group[2] as usize] as u32) & 3) << 6 | t[group[3] as usize] as u32) as u8,
            ];
            let mut byte_changes = Vec::new();
            for k in 0..3 {
                let off = pdf_start + k;
                if off >= stream_start && off < stream_end && old_bytes[k] != new_bytes[k] {
                    byte_changes.push((off - stream_start, new_bytes[k]));
                }
            }
            if byte_changes.is_empty() { continue; }
            candidates.push(Change { b64_pos, new_char, byte_changes });
        }
    }
    candidates
}
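// One concrete candidate, checked by hand: in "TWFuTWFu" ("ManMan"),
// changing char 0 'T' -> 'U' turns decoded byte 0 from 77 ('M') into
// (20 << 2) | 1 = 81 ('Q'). A sketch added for illustration.
#[cfg(test)]
mod candidate_tests {
    use super::*;

    #[test]
    fn substitution_maps_to_expected_byte_change() {
        let t = b64_table();
        let cands = generate_candidates(b"TWFuTWFu", &t, 0, 6, 0, 8, false);
        assert!(cands.iter().any(|c| {
            c.b64_pos == 0 && c.new_char == b'U' && c.byte_changes == vec![(0, 81)]
        }));
    }
}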
// ── Parallel evaluation ─────────────────────────────────────────────
/// Evaluate candidates in parallel. Returns (index, new_error_pos) pairs
/// for candidates that move the error forward; new_error_pos == usize::MAX
/// means the stream is fully fixed.
fn eval_candidates(
    stream_data: &[u8],
    candidates: &[Change],
    err_pos: usize,
) -> Vec<(usize, usize)> {
    candidates.par_iter().enumerate().filter_map(|(idx, cand)| {
        let mut data = stream_data.to_vec();
        for &(off, val) in &cand.byte_changes {
            if off < data.len() { data[off] = val; }
        }
        if can_inflate_raw(&data) {
            return Some((idx, usize::MAX));
        }
        if passes_err_pos(&data, err_pos) {
            let new_err = find_corruption_offset(&data);
            if new_err > err_pos {
                return Some((idx, new_err));
            }
        }
        None
    }).collect()
}
// ── Iterative greedy search ─────────────────────────────────────────
/// Greedy repair: at each iteration, apply the single substitution that
/// pushes the corruption offset furthest forward.
fn iterative_search(
    b64: &mut Vec<u8>, t: &[u8; 128],
    stream_start: usize, stream_end: usize,
    max_iter: usize, window: usize,
) -> SearchResult {
    let mut stream_data = decode_range(b64, stream_start, stream_end, t);
    let mut changes: Vec<(usize, u8, u8)> = Vec::new();
    for iter in 0..max_iter {
        if can_inflate_raw(&stream_data) {
            return SearchResult::success(&changes, format!("{} fixes in {} iter", changes.len(), iter));
        }
        let err_pos = find_corruption_offset(&stream_data);
        // Error in checksum area = raw data is fixed; caller does Adler-32 repair
        if err_pos + 6 >= stream_data.len() {
            eprintln!();
            return SearchResult::success(&changes,
                format!("{} fixes + checksum needed", changes.len()));
        }
        let b64_center = pdf_to_b64(stream_start + err_pos);
        let b64_lo = pdf_to_b64(stream_start).max(b64_center.saturating_sub(window));
        let b64_hi = (pdf_to_b64(stream_end) + 4).min(b64_center + 20).min(b64.len());
        let candidates = generate_candidates(b64, t, stream_start, stream_end, b64_lo, b64_hi, false);
        if candidates.is_empty() { break; }
        let results = eval_candidates(&stream_data, &candidates, err_pos);
        if results.is_empty() { break; }
        let &(best_idx, best_err) = results.iter().max_by_key(|r| r.1).unwrap();
        let cand = &candidates[best_idx];
        let old_char = b64[cand.b64_pos];
        b64[cand.b64_pos] = cand.new_char;
        for &(off, val) in &cand.byte_changes {
            stream_data[off] = val;
        }
        changes.push((cand.b64_pos, old_char, cand.new_char));
        eprint!("\r iter {}: err {} -> {}, b64[{}] '{}' -> '{}'",
            iter, err_pos,
            if best_err == usize::MAX { "OK".to_string() } else { best_err.to_string() },
            cand.b64_pos, old_char as char, cand.new_char as char);
        if best_err == usize::MAX {
            eprintln!();
            return SearchResult::success(&changes, format!("{} fixes in {} iter", changes.len(), iter + 1));
        }
    }
    // Final check
    if can_inflate_raw(&stream_data) {
        eprintln!();
        return SearchResult::success(&changes, format!("{} fixes", changes.len()));
    }
    // No luck: revert all applied changes
    for &(pos, old, _) in changes.iter().rev() {
        b64[pos] = old;
    }
    eprintln!();
    SearchResult::fail(format!("iterative: {} partial reverted", changes.len()))
}
// ── DFS with backtracking ───────────────────────────────────────────
struct DfsCtx<'a> {
    b64: &'a mut Vec<u8>,
    t: &'a [u8; 128],
    stream_data: Vec<u8>,
    stream_start: usize,
    stream_end: usize,
    applied: Vec<(usize, u8, u8, Vec<(usize, u8)>)>, // (b64_pos, old_char, new_char, old_bytes)
    node_count: usize,
    max_nodes: usize,
    max_depth: usize,
    window: usize,
    branch: usize,
}
impl<'a> DfsCtx<'a> {
    /// Depth-first search over substitutions, trying the `branch` best
    /// candidates at each node and backtracking on failure.
    fn search(&mut self, depth: usize) -> bool {
        if can_inflate_raw(&self.stream_data) { return true; }
        if depth >= self.max_depth || self.node_count >= self.max_nodes { return false; }
        self.node_count += 1;
        let err_pos = find_corruption_offset(&self.stream_data);
        // Error in checksum area = raw data is OK
        if err_pos + 6 >= self.stream_data.len() { return true; }
        let b64_center = pdf_to_b64(self.stream_start + err_pos);
        // Pass 1: confusion pairs in a wide window
        let b64_lo = pdf_to_b64(self.stream_start).max(b64_center.saturating_sub(self.window));
        let b64_hi = (pdf_to_b64(self.stream_end) + 4).min(b64_center + 20).min(self.b64.len());
        let mut candidates = generate_candidates(self.b64, self.t, self.stream_start, self.stream_end, b64_lo, b64_hi, true);
        let mut results = eval_candidates(&self.stream_data, &candidates, err_pos);
        // Pass 2: all chars in a tight window when the confusion pass yields
        // nothing usable. (Gating on eval results, not candidate existence:
        // unhelpful confusion pairs must not block this fallback.)
        if results.is_empty() {
            let tight_lo = pdf_to_b64(self.stream_start).max(b64_center.saturating_sub(60));
            let tight_hi = (pdf_to_b64(self.stream_end) + 4).min(b64_center + 10).min(self.b64.len());
            candidates = generate_candidates(self.b64, self.t, self.stream_start, self.stream_end, tight_lo, tight_hi, false);
            results = eval_candidates(&self.stream_data, &candidates, err_pos);
        }
        if results.is_empty() { return false; }
        let mut sorted: Vec<(usize, usize)> = results;
        sorted.sort_by(|a, b| b.1.cmp(&a.1));
        for &(idx, improvement) in sorted.iter().take(self.branch) {
            if self.node_count >= self.max_nodes { break; }
            let cand = candidates[idx].clone();
            let old_char = self.b64[cand.b64_pos];
            // Save old bytes for revert
            let old_bytes: Vec<(usize, u8)> = cand.byte_changes.iter()
                .map(|&(off, _)| (off, self.stream_data[off]))
                .collect();
            // Apply
            self.b64[cand.b64_pos] = cand.new_char;
            for &(off, val) in &cand.byte_changes {
                self.stream_data[off] = val;
            }
            self.applied.push((cand.b64_pos, old_char, cand.new_char, old_bytes.clone()));
            if improvement == usize::MAX || self.search(depth + 1) {
                return true;
            }
            // Revert
            let (pos, old, _, bytes) = self.applied.pop().unwrap();
            self.b64[pos] = old;
            for (off, val) in bytes {
                self.stream_data[off] = val;
            }
        }
        false
    }
}
fn dfs_search(
    b64: &mut Vec<u8>, t: &[u8; 128],
    stream_start: usize, stream_end: usize,
    max_nodes: usize, max_depth: usize, window: usize, branch: usize,
) -> SearchResult {
    let stream_data = decode_range(b64, stream_start, stream_end, t);
    if can_inflate_raw(&stream_data) {
        return SearchResult::success(&[], "already OK".into());
    }
    let mut ctx = DfsCtx {
        b64, t, stream_data, stream_start, stream_end,
        applied: Vec::new(),
        node_count: 0, max_nodes, max_depth, window, branch,
    };
    if ctx.search(0) {
        let changes: Vec<(usize, u8, u8)> = ctx.applied.iter()
            .map(|(pos, old, new, _)| (*pos, *old, *new))
            .collect();
        eprintln!(" DFS: {} fixes, {} nodes explored", changes.len(), ctx.node_count);
        SearchResult::success(&changes, format!("DFS {} fixes ({} nodes)", changes.len(), ctx.node_count))
    } else {
        // Revert (should already be reverted by backtracking, but just in case)
        for (pos, old, _, bytes) in ctx.applied.iter().rev() {
            ctx.b64[*pos] = *old;
            for &(off, val) in bytes {
                ctx.stream_data[off] = val;
            }
        }
        eprintln!(" DFS: exhausted ({} nodes)", ctx.node_count);
        SearchResult::fail(format!("DFS exhausted ({} nodes)", ctx.node_count))
    }
}
// ── Output ──────────────────────────────────────────────────────────
#[derive(Serialize)]
struct ChangeOutput {
    b64_pos: usize,
    old_char: String,
    new_char: String,
}
#[derive(Serialize)]
struct SearchResult {
    fixed: bool,
    changes: Vec<ChangeOutput>,
    desc: String,
}
impl SearchResult {
    fn success(changes: &[(usize, u8, u8)], desc: String) -> Self {
        SearchResult {
            fixed: true,
            changes: changes.iter().map(|&(pos, old, new)| ChangeOutput {
                b64_pos: pos,
                old_char: String::from(old as char),
                new_char: String::from(new as char),
            }).collect(),
            desc,
        }
    }
    fn fail(desc: String) -> Self {
        SearchResult { fixed: false, changes: vec![], desc }
    }
}
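// Pinning the JSON shape the caller consumes on stdout; serde_json emits
// fields in struct-definition order. A small test added as a sketch.
#[cfg(test)]
mod output_tests {
    use super::*;

    #[test]
    fn result_serializes_compactly() {
        let r = SearchResult::success(&[(7, b'l', b'1')], "demo".into());
        assert_eq!(
            serde_json::to_string(&r).unwrap(),
            r#"{"fixed":true,"changes":[{"b64_pos":7,"old_char":"l","new_char":"1"}],"desc":"demo"}"#
        );
    }
}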
// ── Main ────────────────────────────────────────────────────────────
fn main() {
    // Rayon threads need enough stack for zlib's internal state + output buffers
    rayon::ThreadPoolBuilder::new()
        .stack_size(4 * 1024 * 1024) // 4 MB per thread
        .build_global()
        .ok();
    let args: Vec<String> = env::args().collect();
    if args.len() < 4 {
        eprintln!("Usage: stream_fixer <b64_file> <stream_start> <stream_end> [options]");
        eprintln!("Options: --max-iter N --max-nodes N --window N --branch N --max-depth N");
        process::exit(1);
    }
    let b64_file = &args[1];
    let stream_start: usize = args[2].parse().expect("invalid stream_start");
    let stream_end: usize = args[3].parse().expect("invalid stream_end");
    // Parse options (args.get guards against a flag with a missing value)
    let mut max_iter = 30usize;
    let mut max_nodes = 100_000usize;
    let mut window = 600usize;
    let mut branch = 3usize;
    let mut max_depth = 20usize;
    let mut i = 4;
    while i < args.len() {
        match args[i].as_str() {
            "--max-iter" => { i += 1; max_iter = args.get(i).expect("--max-iter needs a value").parse().expect("--max-iter: not a number"); }
            "--max-nodes" => { i += 1; max_nodes = args.get(i).expect("--max-nodes needs a value").parse().expect("--max-nodes: not a number"); }
            "--window" => { i += 1; window = args.get(i).expect("--window needs a value").parse().expect("--window: not a number"); }
            "--branch" => { i += 1; branch = args.get(i).expect("--branch needs a value").parse().expect("--branch: not a number"); }
            "--max-depth" => { i += 1; max_depth = args.get(i).expect("--max-depth needs a value").parse().expect("--max-depth: not a number"); }
            _ => { eprintln!("Unknown option: {}", args[i]); process::exit(1); }
        }
        i += 1;
    }
    eprintln!("stream_fixer: stream [{}, {}), {}b, window={}, max_iter={}, max_nodes={}, branch={}, depth={}",
        stream_start, stream_end, stream_end - stream_start,
        window, max_iter, max_nodes, branch, max_depth);
    // Read base64 (assumed ASCII; only whitespace is stripped)
    let raw = fs::read_to_string(b64_file).expect("cannot read b64 file");
    let clean: String = raw.chars().filter(|c| !c.is_whitespace()).collect();
    let mut b64 = clean.into_bytes();
    let t = b64_table();
    // Check current state
    let stream_data = decode_range(&b64, stream_start, stream_end, &t);
    let err_pos = find_corruption_offset(&stream_data);
    let raw_ok = can_inflate_raw(&stream_data);
    let full_ok = can_inflate(&stream_data);
    eprintln!(" initial: raw_ok={}, full_ok={}, err_pos={}/{}", raw_ok, full_ok, err_pos, stream_data.len());
    if full_ok {
        let result = SearchResult::success(&[], "already OK".into());
        println!("{}", serde_json::to_string(&result).unwrap());
        return;
    }
    if raw_ok {
        let result = SearchResult::success(&[], "raw OK, needs checksum only".into());
        println!("{}", serde_json::to_string(&result).unwrap());
        return;
    }
    // Phase 1: Iterative greedy
    eprintln!(" Phase 1: iterative greedy...");
    let result = iterative_search(&mut b64, &t, stream_start, stream_end, max_iter, window);
    if result.fixed {
        println!("{}", serde_json::to_string(&result).unwrap());
        return;
    }
    // Phase 2: DFS with backtracking
    eprintln!(" Phase 2: DFS backtracking...");
    let result = dfs_search(&mut b64, &t, stream_start, stream_end, max_nodes, max_depth, window, branch);
    println!("{}", serde_json::to_string(&result).unwrap());
}
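// End-to-end check of both search phases. We build a real zlib stream with
// libz's compress(), base64-encode it, then flip b64 char 3: its 6 bits are
// the low bits of decoded byte 2 (the first deflate byte), so value 0b000110
// forces BTYPE = 11 (reserved), a guaranteed hard error near offset 0.
// A sketch appended for illustration; payload and parameters are arbitrary.
#[cfg(test)]
mod search_tests {
    use super::*;

    fn zlib_compress(data: &[u8]) -> Vec<u8> {
        unsafe {
            let mut dest_len = (data.len() + data.len() / 1000 + 64) as uLong;
            let mut dest = vec![0u8; dest_len as usize];
            let rc = compress(dest.as_mut_ptr(), &mut dest_len, data.as_ptr(), data.len() as uLong);
            assert_eq!(rc, Z_OK);
            dest.truncate(dest_len as usize);
            dest
        }
    }

    /// Unpadded encoder; trailing zero bytes stand in for '=' padding,
    /// which is fine because decode_range only reads [start, end).
    fn encode_b64(data: &[u8]) -> Vec<u8> {
        let mut padded = data.to_vec();
        while padded.len() % 3 != 0 { padded.push(0); }
        let mut out = Vec::with_capacity(padded.len() / 3 * 4);
        for chunk in padded.chunks(3) {
            let n = ((chunk[0] as u32) << 16) | ((chunk[1] as u32) << 8) | chunk[2] as u32;
            for shift in [18, 12, 6, 0] {
                out.push(B64[(n >> shift) as usize & 63]);
            }
        }
        out
    }

    /// Both 6 and 62 have bits 1..2 set, so BTYPE becomes reserved either way.
    fn corrupt(b64: &mut [u8]) {
        b64[3] = if b64[3] == B64[6] { B64[62] } else { B64[6] };
    }

    #[test]
    fn greedy_phase_repairs_single_bad_char() {
        let payload: Vec<u8> = (0u32..2000).map(|i| (i % 251) as u8).collect();
        let compressed = zlib_compress(&payload);
        let t = b64_table();
        let mut b64 = encode_b64(&compressed);
        corrupt(&mut b64);
        assert!(!can_inflate_raw(&decode_range(&b64, 0, compressed.len(), &t)));
        let result = iterative_search(&mut b64, &t, 0, compressed.len(), 30, 600);
        assert!(result.fixed);
    }

    #[test]
    fn dfs_phase_repairs_single_bad_char() {
        let payload: Vec<u8> = (0u32..2000).map(|i| (i % 251) as u8).collect();
        let compressed = zlib_compress(&payload);
        let t = b64_table();
        let mut b64 = encode_b64(&compressed);
        corrupt(&mut b64);
        let result = dfs_search(&mut b64, &t, 0, compressed.len(), 1_000, 20, 600, 3);
        assert!(result.fixed);
    }
}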