Advertisement
Guest User

ocr with google ai studio

a guest
Nov 8th, 2024
198
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.33 KB | None | 0 0
  1. import os
  2. import sys
  3. import tempfile
  4. import google.generativeai as genai
  5. from pdf2image import convert_from_path
  6. from PIL import Image
  7. import time
  8. from datetime import datetime
  9. import glob
  10.  
  11. # Set Gemini API key directly
  12. genai.configure(api_key="your key here")
  13.  
  14. def countdown_timer(seconds):
  15. """
  16. Display a countdown timer.
  17. """
  18. for remaining in range(seconds, 0, -1):
  19. sys.stdout.write(f"\rWaiting for {remaining} seconds... ")
  20. sys.stdout.flush()
  21. time.sleep(1)
  22. sys.stdout.write("\rWait complete! \n")
  23. sys.stdout.flush()
  24.  
  25. def update_progress(progress):
  26. """
  27. Displays a simple progress bar in the console.
  28. """
  29. bar_length = 50
  30. block = int(round(bar_length * progress))
  31. text = f"\rProgress: [{'#' * block + '-' * (bar_length - block)}] {round(progress * 100, 2)}%"
  32. sys.stdout.write(text)
  33. sys.stdout.flush()
  34.  
  35. def extract_text_from_gemini_api(image_path, page_num):
  36. """
  37. Sends the image to the Gemini API and retrieves the extracted text.
  38. Added detailed logging and error information.
  39. """
  40. try:
  41. print(f"\nProcessing page {page_num}:")
  42. print(f"[{datetime.now().strftime('%H:%M:%S')}] Uploading image to Gemini API...")
  43.  
  44. myfile = genai.upload_file(image_path)
  45. model = genai.GenerativeModel("gemini-1.5-pro")
  46.  
  47. # Add safety settings to reduce false positives
  48. safety_settings = {
  49. "HARM_CATEGORY_HARASSMENT": "BLOCK_NONE",
  50. "HARM_CATEGORY_HATE_SPEECH": "BLOCK_NONE",
  51. "HARM_CATEGORY_SEXUALLY_EXPLICIT": "BLOCK_NONE",
  52. "HARM_CATEGORY_DANGEROUS_CONTENT": "BLOCK_NONE"
  53. }
  54.  
  55. print(f"[{datetime.now().strftime('%H:%M:%S')}] Sending request to Gemini API...")
  56.  
  57. # Generate content with modified prompt and safety settings
  58. result = model.generate_content(
  59. [myfile, "\n\n", "Extract and transcribe any visible text from this image, exactly as it appears."],
  60. safety_settings=safety_settings
  61. )
  62.  
  63. print(f"[{datetime.now().strftime('%H:%M:%S')}] Response received from Gemini API")
  64.  
  65. # Check if response has content
  66. if hasattr(result, 'text'):
  67. print("Successfully extracted text from image")
  68. return result.text
  69. elif hasattr(result, 'candidates'):
  70. # Try to get text from candidates
  71. for candidate in result.candidates:
  72. if hasattr(candidate, 'content'):
  73. print("Successfully extracted text from candidates")
  74. return candidate.content.text
  75.  
  76. print("Warning: No text content found in API response")
  77. return "No text could be extracted from this image."
  78.  
  79. except Exception as e:
  80. error_message = f"\nError processing page {page_num}:\n"
  81. error_message += f"Error Type: {type(e).__name__}\n"
  82. error_message += f"Error Message: {str(e)}\n"
  83.  
  84. if hasattr(e, 'status_code'):
  85. error_message += f"Status Code: {e.status_code}\n"
  86. if hasattr(e, 'response'):
  87. error_message += f"Response: {e.response}\n"
  88. if hasattr(e, 'details'):
  89. error_message += f"Details: {e.details}\n"
  90.  
  91. print(error_message)
  92. return f"[ERROR ON PAGE {page_num}]: {error_message}"
  93.  
  94. def process_pdf(pdf_path, output_txt_path):
  95. """
  96. Converts each page of the PDF to an image, extracts text using Gemini API,
  97. and writes it to a TXT file. Includes a 70-second delay between API calls.
  98. """
  99. try:
  100. print("\nInitializing PDF processing...")
  101. print(f"[{datetime.now().strftime('%H:%M:%S')}] Converting PDF to images...")
  102. images = convert_from_path(pdf_path)
  103. total_pages = len(images)
  104. print(f"Total pages detected: {total_pages}")
  105. except Exception as e:
  106. print(f"Error converting PDF to images: {str(e)}")
  107. return
  108.  
  109. extracted_text = []
  110. for idx, image in enumerate(images, start=1):
  111. print(f"\n{'='*50}")
  112. print(f"Processing page {idx} of {total_pages}")
  113. print(f"{'='*50}")
  114.  
  115. # Create temporary file
  116. with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_image:
  117. image_path = temp_image.name
  118.  
  119. # Save image
  120. print(f"[{datetime.now().strftime('%H:%M:%S')}] Saving temporary image...")
  121. image.save(image_path, "JPEG")
  122. print(f"Temporary image saved to: {image_path}")
  123.  
  124. # Extract text
  125. text = extract_text_from_gemini_api(image_path, idx)
  126. extracted_text.append(text)
  127.  
  128. # Remove temporary file immediately after getting the response
  129. try:
  130. os.remove(image_path)
  131. print(f"[{datetime.now().strftime('%H:%M:%S')}] Temporary image removed")
  132. except Exception as e:
  133. print(f"\nWarning: Could not remove temporary file {image_path}: {e}")
  134.  
  135. # Update progress
  136. update_progress(idx / total_pages)
  137.  
  138. # Add 70-second delay unless I tweak this between API calls if there are more pages to process
  139. if idx < total_pages:
  140. print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Starting 70-second cooldown period...")
  141. countdown_timer(5)
  142.  
  143. # Write all extracted text to the output TXT file
  144. try:
  145. print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Writing extracted text to file...")
  146. with open(output_txt_path, 'w', encoding='utf-8') as txt_file:
  147. txt_file.write("\n\n".join(extracted_text))
  148. print(f"Text extraction complete. Output saved to: {output_txt_path}")
  149. except Exception as e:
  150. print(f"\nError writing to TXT file: {e}")
  151.  
  152. def main():
  153. """
  154. Main function to execute the PDF to TXT conversion.
  155. """
  156. print('\n********************************')
  157. print('*** General PDF to TXT Converter ***')
  158. print('********************************\n')
  159.  
  160. # Get list of PDFs in current directory
  161. pdf_files = glob.glob("*.pdf")
  162.  
  163. if not pdf_files:
  164. print("No PDF files found in the current directory.")
  165. return
  166.  
  167. print("Available PDF files:")
  168. print("-" * 50)
  169. for idx, pdf in enumerate(pdf_files, 1):
  170. print(f"{idx}. {pdf}")
  171. print("-" * 50)
  172.  
  173. # Get user selection
  174. while True:
  175. try:
  176. selection = input("\nEnter the number of the PDF you want to process (or 'q' to quit): ")
  177.  
  178. if selection.lower() == 'q':
  179. print("Exiting program.")
  180. return
  181.  
  182. selection = int(selection)
  183. if 1 <= selection <= len(pdf_files):
  184. break
  185. else:
  186. print(f"Please enter a number between 1 and {len(pdf_files)}")
  187. except ValueError:
  188. print("Please enter a valid number")
  189.  
  190. # Get the selected PDF file
  191. pdf_path = pdf_files[selection - 1]
  192. print(f"\nSelected: {pdf_path}")
  193.  
  194. # Define the output TXT file path
  195. base_name = os.path.splitext(os.path.basename(pdf_path))[0]
  196. output_txt_path = f"{base_name}.txt"
  197.  
  198. # Process the PDF and extract text
  199. process_pdf(pdf_path, output_txt_path)
  200.  
  201. if __name__ == "__main__":
  202. main()
  203.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement