Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import fitz # PyMuPDF
- from pathlib import Path
- from time import monotonic
- from os import cpu_count
- from concurrent.futures import ProcessPoolExecutor
- from io import BytesIO
- import re # For regex matching of .htm filenames
- SOURCE_DIR = Path("E:/Desktop/New folder (5)/New folder (4)")
- TARGET_DIR = Path("E:/Desktop/New folder (5)/New folder (5)")
# Matches "<digits>.htm" in a page's text; compiled once instead of per page.
_HTM_NAME_RE = re.compile(r'(\d+)\.htm')


def process(path: Path) -> tuple[float, int]:
    """Render the pages of one PDF to PNG files named after their ``.htm`` reference.

    The text of every page is searched for ``<digits>.htm``. When a match is
    found, the page is rendered at 9x zoom and written to
    ``TARGET_DIR/<digits>.png``; pages without a match are reported on stdout
    and skipped.

    Args:
        path: The PDF file to process.

    Returns:
        ``(elapsed_seconds, page_count)`` for the file, or ``(0.0, 0)`` if
        anything went wrong while processing it (the error is printed, not
        raised, so one bad file does not abort the rest of the batch).
    """
    print(f"Processing {path.name}")
    try:
        with fitz.open(path) as pdf:
            start = monotonic()
            total_pages = pdf.page_count
            zoom = fitz.Matrix(9, 9)  # loop-invariant render scale
            for i in range(total_pages):
                page = pdf.load_page(i)
                # Search for the .htm name in the page text first.
                match = _HTM_NAME_RE.search(page.get_text())
                if match is None:
                    print(f"No .htm name found on page {i+1} of {path.name}")
                    continue
                # Rasterise only once we know the page will be saved: the 9x
                # render is by far the most expensive step of the loop.
                pix = page.get_pixmap(matrix=zoom)
                # The digits before ".htm" become the image file name.
                img_path = TARGET_DIR / f"{match.group(1)}.png"
                img_path.write_bytes(pix.tobytes("png"))
            return (monotonic() - start, total_pages)
    except Exception as e:
        # Deliberate best-effort boundary: report and carry on with other files.
        print(f"Error processing {path.name}: {e}")
        return (0.0, 0)
# ProcessPoolExecutor rejects max_workers above 61 on Windows.
_MAX_POOL_WORKERS = 61


def main() -> None:
    """Convert every ``*.pdf`` in ``SOURCE_DIR`` using a pool of worker processes.

    Creates ``TARGET_DIR`` if needed, runs ``process`` on each PDF in
    parallel, and prints a short summary of the work done. Does nothing
    beyond creating the target directory when no PDFs are found.
    """
    TARGET_DIR.mkdir(parents=True, exist_ok=True)
    pdf_files = list(SOURCE_DIR.glob("*.pdf"))
    if not pdf_files:
        # max_workers=0 would make ProcessPoolExecutor raise ValueError.
        print(f"No PDF files found in {SOURCE_DIR}")
        return

    # cpu_count() may return None when the core count cannot be determined.
    cpus = cpu_count() or 1
    num_jobs = min(len(pdf_files), cpus * 2, _MAX_POOL_WORKERS)

    with ProcessPoolExecutor(max_workers=num_jobs) as executor:
        # Consume the iterator so that a crashed worker or broken pool is
        # raised here instead of being silently dropped.
        results = list(executor.map(process, pdf_files))

    total_seconds = sum(elapsed for elapsed, _ in results)
    total_pages = sum(pages for _, pages in results)
    print(
        f"Processed {total_pages} pages from {len(pdf_files)} files "
        f"in {total_seconds:.2f}s of worker time"
    )


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment