# Untitled

import fitz  # PyMuPDF
from pathlib import Path
from time import monotonic
from os import cpu_count
from concurrent.futures import ProcessPoolExecutor
from io import BytesIO
import re  # For regex matching of .htm filenames

# Folder scanned (non-recursively, via glob("*.pdf") in main) for input PDFs.
SOURCE_DIR = Path("E:/Desktop/New folder (5)/New folder (4)")
# Folder the rendered PNG files are written to; main() creates it if missing.
TARGET_DIR = Path("E:/Desktop/New folder (5)/New folder (5)")

# Matches a reference such as "12345.htm"; group 1 is the numeric stem.
_HTM_NAME_RE = re.compile(r'(\d+)\.htm')


def process(path: Path, zoom: float = 9.0) -> tuple[float, int]:
    """Render pages of one PDF to PNG files named after their ``.htm`` reference.

    For every page, the page text is searched for the first ``<digits>.htm``
    occurrence. When one is found the page is rasterized and written to
    ``TARGET_DIR / "<digits>.png"``; pages without such a reference are
    reported and skipped.

    Args:
        path: Location of the PDF file to process.
        zoom: Scale factor applied to both axes when rasterizing a page.
            Defaults to 9, the value that was previously hard-coded.

    Returns:
        ``(elapsed_seconds, page_count)`` for the file, or ``(0.0, 0)`` if
        any exception was raised while processing it (the error is printed,
        not propagated, so one bad file does not stop a batch).
    """
    print(f"Processing {path.name}")
    try:
        with fitz.open(path) as pdf:
            start = monotonic()
            total_pages = pdf.page_count
            scale = fitz.Matrix(zoom, zoom)  # loop-invariant, build once

            for i in range(total_pages):
                page = pdf.load_page(i)

                # Look for the name first: rasterizing at high zoom is by far
                # the most expensive step, so skip it for pages we won't save.
                match = _HTM_NAME_RE.search(page.get_text())
                if match is None:
                    print(f"No .htm name found on page {i+1} of {path.name}")
                    continue

                # NOTE: the output name depends only on the number found, so
                # pages (in this or any other PDF) that share a number
                # overwrite each other's PNG.
                img_path = TARGET_DIR / f"{match.group(1)}.png"
                pix = page.get_pixmap(matrix=scale)
                img_path.write_bytes(pix.tobytes("png"))

            return (monotonic() - start, total_pages)
    except Exception as e:
        # Worker boundary: report and fall through to the sentinel result.
        print(f"Error processing {path.name}: {e}")
    return (0.0, 0)

def main() -> None:
    """Convert every ``*.pdf`` directly inside SOURCE_DIR using a process pool.

    Creates TARGET_DIR if needed, runs ``process`` on each PDF in parallel,
    and prints a short summary of the pages handled. Returns early with a
    message when SOURCE_DIR contains no PDF files.
    """
    TARGET_DIR.mkdir(parents=True, exist_ok=True)
    pdf_files = list(SOURCE_DIR.glob("*.pdf"))
    if not pdf_files:
        # Without this guard max_workers would be 0, which
        # ProcessPoolExecutor rejects with ValueError.
        print(f"No PDF files found in {SOURCE_DIR}")
        return

    # cpu_count() may return None when the count cannot be determined, and
    # ProcessPoolExecutor refuses more than 61 workers on Windows.
    num_jobs = min(len(pdf_files), (cpu_count() or 1) * 2, 61)

    start = monotonic()
    with ProcessPoolExecutor(max_workers=num_jobs) as executor:
        # Consume the iterator so a broken pool surfaces here instead of
        # being silently dropped, and so the per-file results can be summed.
        results = list(executor.map(process, pdf_files))

    total_pages = sum(pages for _, pages in results)
    print(
        f"Processed {total_pages} pages across {len(pdf_files)} file(s) "
        f"in {monotonic() - start:.1f}s"
    )

# Run the batch only when executed as a script.
# NOTE(review): process-pool workers may re-import this module (e.g. with the
# "spawn" start method), so main() presumably must stay behind this guard.
if __name__ == "__main__":
    main()