genericPaster

PDF OCR

Sep 2nd, 2024
830
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.57 KB | None | 0 0
  1. import argparse
  2. import base64
  3. import requests
  4. import fitz  # PyMuPDF
  5. import io
  6.  
  7. def process_pdf(file_path):
  8.     try:
  9.         doc = fitz.open(file_path)
  10.         processed_images = []
  11.        
  12.         for page in doc:
  13.             pix = page.get_pixmap()
  14.             img_bytes = pix.tobytes("jpeg")
  15.             base64_encoded = base64.b64encode(img_bytes).decode('utf-8')
  16.             processed_images.append(base64_encoded)
  17.        
  18.         return processed_images, doc.page_count
  19.     except Exception as e:
  20.         print(f"Error processing PDF: {str(e)}")
  21.         return None, None
  22.  
  23. class LLMProcessor:
  24.     def __init__(self, api_url, api_password):
  25.         self.api_url = api_url
  26.         self.headers = {
  27.             "Content-Type": "application/json",
  28.             "Authorization": f"Bearer {api_password}",
  29.         }
  30.  
  31.     def send_image_to_llm(self, base64_image, page_number, total_pages):
  32.         prompt = f"<|im_start|>user\nRepeat verbatim all text on the image.<|im_end|>\n<|im_start|>assistant\n"
  33.         payload = {
  34.             "prompt": prompt,
  35.             "max_length": 2048,
  36.             "images": [base64_image],
  37.             "temp": 0,
  38.         }
  39.         response = requests.post(f"{self.api_url}/api/v1/generate", json=payload, headers=self.headers)
  40.         if response.status_code == 200:
  41.             return response.json()["results"][0].get("text")
  42.         else:
  43.             print(f"Error: {response.status_code} - {response.text}")
  44.             return None
  45.  
  46. def main():
  47.     parser = argparse.ArgumentParser(description="Send all PDF images to LLM API")
  48.     parser.add_argument("pdf_path", help="Path to the PDF file")
  49.     parser.add_argument("--api-url", default="http://localhost:5001", help="URL for the LLM API")
  50.     parser.add_argument("--api-password", default="", help="Password for the LLM API")
  51.     args = parser.parse_args()
  52.  
  53.     llm_processor = LLMProcessor(args.api_url, args.api_password)
  54.  
  55.     base64_images, total_pages = process_pdf(args.pdf_path)
  56.     if base64_images and total_pages:
  57.         print(f"Processing PDF with {total_pages} pages.")
  58.         for i, base64_image in enumerate(base64_images, start=1):
  59.             print(f"\nProcessing page {i} of {total_pages}:")
  60.             result = llm_processor.send_image_to_llm(base64_image, i, total_pages)
  61.             if result:
  62.                 print("LLM Response:")
  63.                 print(result)
  64.             else:
  65.                 print(f"Failed to get a response from the LLM for page {i}.")
  66.     else:
  67.         print("Failed to process the PDF.")
  68.  
# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
Add Comment
Please, Sign In to add comment