# OCR with Google AI Studio (Gemini): convert a PDF to text, one page image at a time.

import os
import sys
import tempfile
import google.generativeai as genai
from pdf2image import convert_from_path
from PIL import Image
import time
from datetime import datetime
import glob

# Configure the Gemini client. Prefer the GOOGLE_API_KEY environment variable so
# the secret does not have to live in source control; an unset or empty variable
# falls back to the inline placeholder, which must then be replaced by hand.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or "your key here")

def countdown_timer(seconds):
    """
    Block for `seconds` whole seconds while showing a live countdown.

    The remaining time is redrawn in place once per second (carriage return,
    no newline); a final "Wait complete!" line is printed when the wait is
    over. A value of zero or less skips the wait and only prints that line.
    """
    # Walk seconds, seconds - 1, ..., 1.
    for remaining in reversed(range(1, seconds + 1)):
        print(f"\rWaiting for {remaining} seconds...  ", end="", flush=True)
        time.sleep(1)

    # Trailing spaces overwrite what is left of the longer countdown text.
    print("\rWait complete!            ", flush=True)

def update_progress(progress):
    """
    Draw a 50-character progress bar on the current console line.

    Args:
        progress: Fraction of work completed. Values outside [0, 1] are
            clamped so the bar always stays exactly 50 characters wide and
            the percentage stays within 0-100.
    """
    bar_length = 50
    # Clamp first: an out-of-range fraction would otherwise produce an
    # over-long run of '#' or '-' and a nonsensical percentage.
    progress = min(max(progress, 0.0), 1.0)
    filled = int(round(bar_length * progress))
    bar = "#" * filled + "-" * (bar_length - filled)
    # Leading carriage return redraws the bar in place; no newline is written.
    sys.stdout.write(f"\rProgress: [{bar}] {round(progress * 100, 2)}%")
    sys.stdout.flush()

def _text_from_candidates(result):
    """Return the joined part texts of the first candidate that has any, else ""."""
    for candidate in getattr(result, "candidates", None) or []:
        content = getattr(candidate, "content", None)
        parts = getattr(content, "parts", None) or []
        pieces = [getattr(part, "text", "") or "" for part in parts]
        joined = "".join(pieces)
        if joined:
            return joined
    return ""


def extract_text_from_gemini_api(image_path, page_num):
    """
    Upload one page image to the Gemini API and return the transcribed text.

    Args:
        image_path: Path of the image file to upload.
        page_num: Page number, used only in log output and error markers.

    Returns:
        The extracted text; a fixed "No text could be extracted" message when
        the response carries no text; or a string starting with
        "[ERROR ON PAGE <page_num>]:" describing the failure. This function
        never raises - every exception is converted into that error string.
    """
    myfile = None
    try:
        print(f"\nProcessing page {page_num}:")
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Uploading image to Gemini API...")

        myfile = genai.upload_file(image_path)
        model = genai.GenerativeModel("gemini-1.5-pro")

        # Relax safety filters: OCR of arbitrary documents otherwise risks
        # being blocked on false positives.
        safety_settings = {
            "HARM_CATEGORY_HARASSMENT": "BLOCK_NONE",
            "HARM_CATEGORY_HATE_SPEECH": "BLOCK_NONE",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT": "BLOCK_NONE",
            "HARM_CATEGORY_DANGEROUS_CONTENT": "BLOCK_NONE"
        }

        print(f"[{datetime.now().strftime('%H:%M:%S')}] Sending request to Gemini API...")

        result = model.generate_content(
            [myfile, "\n\n", "Extract and transcribe any visible text from this image, exactly as it appears."],
            safety_settings=safety_settings
        )

        print(f"[{datetime.now().strftime('%H:%M:%S')}] Response received from Gemini API")

        # The `.text` quick accessor is expected to raise ValueError when the
        # response holds no usable part (e.g. a blocked or empty reply);
        # hasattr() would not shield against that, so access it under try and
        # fall back to walking the candidates' parts.
        try:
            text = result.text
        except (ValueError, AttributeError):
            text = None

        if text is not None:
            print("Successfully extracted text from image")
            return text

        text = _text_from_candidates(result)
        if text:
            print("Successfully extracted text from candidates")
            return text

        print("Warning: No text content found in API response")
        return "No text could be extracted from this image."

    except Exception as e:
        # Top-level boundary for one page: report as much detail as the
        # exception exposes and hand back a marker so the remaining pages
        # can still be processed.
        error_message = f"\nError processing page {page_num}:\n"
        error_message += f"Error Type: {type(e).__name__}\n"
        error_message += f"Error Message: {str(e)}\n"

        if hasattr(e, 'status_code'):
            error_message += f"Status Code: {e.status_code}\n"
        if hasattr(e, 'response'):
            error_message += f"Response: {e.response}\n"
        if hasattr(e, 'details'):
            error_message += f"Details: {e.details}\n"

        print(error_message)
        return f"[ERROR ON PAGE {page_num}]: {error_message}"

    finally:
        # Best-effort cleanup of the uploaded copy so page images do not pile
        # up in remote file storage; a failure here must not mask the result.
        if myfile is not None:
            try:
                genai.delete_file(myfile.name)
            except Exception as cleanup_error:
                print(f"Warning: Could not delete uploaded file: {cleanup_error}")

def process_pdf(pdf_path, output_txt_path, delay_seconds=5):
    """
    Convert a PDF to text by sending each page image through the Gemini API.

    Every page is rendered to an image, saved to a temporary JPEG, passed to
    extract_text_from_gemini_api, and the per-page results are joined with
    blank lines and written to `output_txt_path` as UTF-8.

    Args:
        pdf_path: Path of the PDF to read.
        output_txt_path: Path of the text file to write (overwritten).
        delay_seconds: Whole seconds to wait between consecutive pages, to
            stay under API rate limits. Zero or less disables the wait.

    Returns:
        None. PDF conversion and output-write failures are printed, not raised.
    """
    try:
        print("\nInitializing PDF processing...")
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Converting PDF to images...")
        images = convert_from_path(pdf_path)
        total_pages = len(images)
        print(f"Total pages detected: {total_pages}")
    except Exception as e:
        print(f"Error converting PDF to images: {str(e)}")
        return

    extracted_text = []
    for idx, image in enumerate(images, start=1):
        print(f"\n{'='*50}")
        print(f"Processing page {idx} of {total_pages}")
        print(f"{'='*50}")

        # Reserve a unique path; the handle is closed right away so the image
        # can be written to it by name.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_image:
            image_path = temp_image.name

        try:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] Saving temporary image...")
            image.save(image_path, "JPEG")
            print(f"Temporary image saved to: {image_path}")

            text = extract_text_from_gemini_api(image_path, idx)
            extracted_text.append(text)
        finally:
            # Always remove the temporary file, even when saving or
            # extraction raised, so failed runs do not leave images behind.
            try:
                os.remove(image_path)
                print(f"[{datetime.now().strftime('%H:%M:%S')}] Temporary image removed")
            except OSError as e:
                print(f"\nWarning: Could not remove temporary file {image_path}: {e}")

        update_progress(idx / total_pages)

        # Pause between API calls, but not after the last page.
        if idx < total_pages and delay_seconds > 0:
            print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Starting {delay_seconds}-second cooldown period...")
            countdown_timer(delay_seconds)

    try:
        print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Writing extracted text to file...")
        with open(output_txt_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write("\n\n".join(extracted_text))
        print(f"Text extraction complete. Output saved to: {output_txt_path}")
    except Exception as e:
        print(f"\nError writing to TXT file: {e}")

def main():
    """
    Interactively pick a PDF from the current directory and convert it to TXT.

    Lists the `*.pdf` files of the working directory in sorted order, prompts
    for a selection by number ('q' quits), and writes the extracted text to
    `<pdf base name>.txt` in the working directory via process_pdf.
    Returns None in every case.
    """
    print('\n********************************')
    print('*** General PDF to TXT Converter ***')
    print('********************************\n')

    # glob returns files in arbitrary filesystem order; sort so the menu
    # numbering is stable between runs.
    pdf_files = sorted(glob.glob("*.pdf"))

    if not pdf_files:
        print("No PDF files found in the current directory.")
        return

    print("Available PDF files:")
    print("-" * 50)
    for idx, pdf in enumerate(pdf_files, 1):
        print(f"{idx}. {pdf}")
    print("-" * 50)

    while True:
        try:
            raw = input("\nEnter the number of the PDF you want to process (or 'q' to quit): ")
        except EOFError:
            # stdin was closed (e.g. piped input ran out): quit cleanly
            # instead of crashing with a traceback.
            print("\nNo input received. Exiting program.")
            return

        raw = raw.strip()
        if raw.lower() == 'q':
            print("Exiting program.")
            return

        try:
            selection = int(raw)
        except ValueError:
            print("Please enter a valid number")
            continue

        if 1 <= selection <= len(pdf_files):
            break
        print(f"Please enter a number between 1 and {len(pdf_files)}")

    # Menu numbers are 1-based; the list is 0-based.
    pdf_path = pdf_files[selection - 1]
    print(f"\nSelected: {pdf_path}")

    # The output file takes the PDF's base name with a .txt extension.
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_txt_path = f"{base_name}.txt"

    process_pdf(pdf_path, output_txt_path)

# Run the interactive converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()