Guest User

OpenAI-Vision API PDF to text

a guest
Nov 4th, 2024
1,785
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.09 KB | None | 0 0
  1. import os
  2. import sys
  3. import base64
  4. import openai
  5. from pdf2image import convert_from_path
  6. import tempfile
  7.  
  8. # Set your OpenAI API key
  9. openai.api_key = 'your-openai-api-key-here'  
  10.  
  11. def update_progress(progress):
  12.     """
  13.    Displays a simple progress bar in the console.
  14.    """
  15.     bar_length = 50
  16.     block = int(round(bar_length * progress))
  17.     text = f"\rProgress: [{'#' * block + '-' * (bar_length - block)}] {round(progress * 100, 2)}%"
  18.     sys.stdout.write(text)
  19.     sys.stdout.flush()
  20.  
  21. def encode_image(image_path):
  22.     """
  23.    Encodes an image file to a base64 string.
  24.    """
  25.     with open(image_path, "rb") as image_file:
  26.         return base64.b64encode(image_file.read()).decode('utf-8')
  27.  
  28. def extract_text_from_openai_api(image_path):
  29.     """
  30.    Sends the base64-encoded image to the OpenAI API and retrieves the extracted text.
  31.    """
  32.     base64_image = encode_image(image_path)
  33.     try:
  34.         response = openai.ChatCompletion.create(
  35.             model="gpt-4o-mini",
  36.             messages=[
  37.                 {
  38.                     "role": "user",
  39.                     "content": [
  40.                         {
  41.                             "type": "text",
  42.                             "text": "Extract the text from this image, ensuring all text is captured accurately. Do not include any markdown or code formatting."
  43.                         },
  44.                         {
  45.                             "type": "image_url",
  46.                             "image_url": {
  47.                                 "url": f"data:image/jpeg;base64,{base64_image}"
  48.                             },
  49.                         },
  50.                     ],
  51.                 }
  52.             ],
  53.         )
  54.         return response.choices[0].message['content']
  55.     except Exception as e:
  56.         print(f"\nError extracting text from image {image_path}: {e}")
  57.         return ""
  58.  
  59. def process_pdf(pdf_path, output_txt_path):
  60.     """
  61.    Converts each page of the PDF to an image, extracts text using OpenAI API, and writes to a TXT file.
  62.    """
  63.     try:
  64.         # Convert PDF to images
  65.         print("Converting PDF to images...")
  66.         images = convert_from_path(pdf_path)
  67.         total_pages = len(images)
  68.     except Exception as e:
  69.         print(f"Error converting PDF to images: {e}")
  70.         return
  71.  
  72.     extracted_text = []
  73.  
  74.     for idx, image in enumerate(images, start=1):
  75.         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_image:
  76.             image_path = temp_image.name
  77.             image.save(image_path, "JPEG")
  78.  
  79.         # Extract text from image using OpenAI API
  80.         text = extract_text_from_openai_api(image_path)
  81.         extracted_text.append(text)
  82.  
  83.         # Remove the temporary image file
  84.         os.remove(image_path)
  85.  
  86.         # Update progress
  87.         update_progress(idx / total_pages)
  88.  
  89.     # Write all extracted text to the output TXT file
  90.     try:
  91.         with open(output_txt_path, 'w', encoding='utf-8') as txt_file:
  92.             txt_file.write("\n".join(extracted_text))
  93.         print(f"\nText extraction complete. Output saved to: {output_txt_path}")
  94.     except Exception as e:
  95.         print(f"\nError writing to TXT file: {e}")
  96.  
  97. def main():
  98.     """
  99.    Main function to execute the PDF to TXT conversion.
  100.    """
  101.     print('\n********************************')
  102.     print('*** General PDF to TXT Converter ***')
  103.     print('********************************\n')
  104.  
  105.     # Prompt user for the PDF file path
  106.     pdf_path = input('Enter the full path to the PDF file: ').strip()
  107.  
  108.     if not os.path.isfile(pdf_path):
  109.         print(f'The path "{pdf_path}" does not exist or is not a file.')
  110.         return
  111.  
  112.     if not pdf_path.lower().endswith('.pdf'):
  113.         print("The provided file is not a PDF.")
  114.         return
  115.  
  116.     # Define the output TXT file path
  117.     base_name = os.path.splitext(os.path.basename(pdf_path))[0]
  118.     output_dir = os.path.dirname(pdf_path)
  119.     output_txt_path = os.path.join(output_dir, f"{base_name}.txt")
  120.  
  121.     # Process the PDF and extract text
  122.     process_pdf(pdf_path, output_txt_path)
  123.  
# Run the interactive converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
  126.  
Add Comment
Please, Sign In to add comment