Guest User

Untitled

a guest
Mar 23rd, 2025
53
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.92 KB | None | 0 0
  1. import fitz # PyMuPDF
  2. from pathlib import Path
  3. from time import monotonic
  4. from os import cpu_count
  5. from concurrent.futures import ProcessPoolExecutor
  6. from io import BytesIO
  7. import re # For regex matching of .htm filenames
  8.  
  9. SOURCE_DIR = Path("E:/Desktop/New folder (5)/New folder (4)")
  10. TARGET_DIR = Path("E:/Desktop/New folder (5)/New folder (5)")
  11.  
  12. def process(path: Path) -> tuple[float, int]:
  13. """Process a single PDF file"""
  14. print(f"Processing {path.name}")
  15. try:
  16. with fitz.open(path) as pdf:
  17. start = monotonic()
  18. total_pages = pdf.page_count
  19.  
  20. for i in range(total_pages):
  21. page = pdf.load_page(i)
  22. pix = page.get_pixmap(matrix=fitz.Matrix(9, 9))
  23.  
  24. # Extract text from the page
  25. page_text = page.get_text()
  26.  
  27. # Search for the .htm name in the text
  28. match = re.search(r'(\d+)\.htm', page_text)
  29. if match:
  30. htm_name = match.group(1) # Get the number before .htm
  31.  
  32. # Set the image path based on the .htm name
  33. img_path = TARGET_DIR / f"{htm_name}.png"
  34. with open(img_path, "wb") as f:
  35. img_buffer = BytesIO(pix.tobytes("png"))
  36. f.write(img_buffer.getvalue())
  37.  
  38. else:
  39. print(f"No .htm name found on page {i+1} of {path.name}")
  40.  
  41. return (monotonic() - start, total_pages)
  42. except Exception as e:
  43. print(f"Error processing {path.name}: {e}")
  44. return (0.0, 0)
  45.  
  46. def main() -> None:
  47. TARGET_DIR.mkdir(parents=True, exist_ok=True)
  48. pdf_files = list(SOURCE_DIR.glob("*.pdf"))
  49.  
  50. num_jobs = min(len(pdf_files), cpu_count() * 2)
  51.  
  52. with ProcessPoolExecutor(max_workers=num_jobs) as executor:
  53. executor.map(process, pdf_files)
  54.  
  55. if __name__ == "__main__":
  56. main()
  57.  
Advertisement
Add Comment
Please, Sign In to add comment