"""PDF OCR: render each page of a PDF to a JPEG and ask an LLM API to transcribe its text."""

import argparse
import base64
import requests
import fitz  # PyMuPDF
import io

def process_pdf(file_path):
    """Render every page of a PDF to a base64-encoded JPEG.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A ``(images, page_count)`` tuple. ``images`` is a list of base64
        strings, one JPEG per page in page order; ``page_count`` is the
        document's page count. If anything goes wrong, the error is printed
        and ``(None, None)`` is returned instead.
    """
    try:
        # The context manager closes the document even when rendering a page
        # raises part-way through, so the PDF is never left open.
        with fitz.open(file_path) as doc:
            processed_images = [
                base64.b64encode(page.get_pixmap().tobytes("jpeg")).decode("utf-8")
                for page in doc
            ]
            # Read the count while the document is still open.
            page_count = doc.page_count
    except Exception as e:  # best-effort boundary: report and signal failure
        print(f"Error processing PDF: {str(e)}")
        return None, None
    return processed_images, page_count

class LLMProcessor:
    """Small HTTP client that sends page images to an LLM generate endpoint.

    Requests are POSTed as JSON to ``<api_url>/api/v1/generate`` with a
    bearer ``Authorization`` header built from the API password.
    """

    def __init__(self, api_url, api_password, timeout=600):
        """Store the connection settings.

        Args:
            api_url: Base URL of the API; trailing slashes are ignored.
            api_password: Value placed after ``Bearer`` in the
                ``Authorization`` header.
            timeout: Seconds to wait for the server before giving up on a
                request. Generous by default because generation can be slow.
        """
        # Drop trailing slashes so appending "/api/v1/generate" never yields
        # a double slash in the request URL.
        self.api_url = api_url.rstrip("/")
        self.timeout = timeout
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_password}",
        }

    def send_image_to_llm(self, base64_image, page_number, total_pages):
        """Ask the LLM to transcribe the text of one page image.

        Args:
            base64_image: Base64-encoded image data for the page.
            page_number: Accepted for interface compatibility; not sent to
                the API.
            total_pages: Accepted for interface compatibility; not sent to
                the API.

        Returns:
            The ``text`` field of the first result in the JSON response, or
            ``None`` when the request fails, the server answers with a
            non-200 status, or the response body has an unexpected shape.
            Failures are reported with ``print``.
        """
        prompt = (
            "<|im_start|>user\nRepeat verbatim all text on the image.<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        payload = {
            "prompt": prompt,
            "max_length": 2048,
            "images": [base64_image],
            "temp": 0,
        }
        try:
            # Without a timeout a stalled server would block this call forever.
            response = requests.post(
                f"{self.api_url}/api/v1/generate",
                json=payload,
                headers=self.headers,
                timeout=self.timeout,
            )
        except requests.RequestException as e:
            # Connection errors and timeouts are reported like any other
            # failure so one bad page does not abort the whole run.
            print(f"Error: request failed - {e}")
            return None

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            return None

        try:
            return response.json()["results"][0].get("text")
        except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
            # Body was not JSON, or lacked the expected results[0] mapping.
            print(f"Error: unexpected response format - {e!r}")
            return None

def main():
    """Command-line entry point.

    Parses the PDF path and API settings from the command line, converts the
    PDF into per-page base64 images, and prints the LLM's reply for each page
    (or a failure notice when no reply is returned).
    """
    cli = argparse.ArgumentParser(description="Send all PDF images to LLM API")
    cli.add_argument("pdf_path", help="Path to the PDF file")
    cli.add_argument("--api-url", default="http://localhost:5001", help="URL for the LLM API")
    cli.add_argument("--api-password", default="", help="Password for the LLM API")
    options = cli.parse_args()

    client = LLMProcessor(options.api_url, options.api_password)

    page_images, page_total = process_pdf(options.pdf_path)

    # Bail out early when conversion failed or produced no pages.
    if not (page_images and page_total):
        print("Failed to process the PDF.")
        return

    print(f"Processing PDF with {page_total} pages.")
    for page_no, encoded_page in enumerate(page_images, start=1):
        print(f"\nProcessing page {page_no} of {page_total}:")
        answer = client.send_image_to_llm(encoded_page, page_no, page_total)
        if not answer:
            print(f"Failed to get a response from the LLM for page {page_no}.")
            continue
        print("LLM Response:")
        print(answer)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()