Advertisement
nicuf

Convert PDF to TXT using OCR (pytesseract)

Jun 18th, 2023
1,176
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.23 KB | None | 0 0
  """Convert every PDF in a folder to a UTF-8 text file using Tesseract OCR.

  Each page of a PDF is rendered to an image with pdf2image, run through
  pytesseract, and the recognized text of all pages is written to a ``.txt``
  file that shares the PDF's base name.
  """

  import os

  import pytesseract
  # NOTE(review): Image and PdfFileReader are not referenced anywhere in the
  # visible script; the imports are kept untouched in case other code relies
  # on them.
  from PIL import Image
  from pdf2image import convert_from_path
  from PyPDF2 import PdfFileReader

  # Path to the folder containing PDF files
  input_folder = "d:/doc/doc"

  # Path to the folder where text files will be saved
  output_folder = "d:/doc/doc"

  # Tesseract language code of the documents ("ron" is Romanian); change it to
  # match the language of your PDFs.
  ocr_language = "ron"

  # Path to the Tesseract OCR executable (change if necessary)
  pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

  # Make sure the destination exists so writing the first text file cannot
  # fail merely because the folder is missing.
  os.makedirs(output_folder, exist_ok=True)

  # Collect the PDF files in the input folder. The extension check is
  # case-insensitive so "SCAN.PDF" is picked up too, and directories whose
  # name happens to end in ".pdf" are skipped.
  files = [
      f
      for f in os.listdir(input_folder)
      if f.lower().endswith(".pdf")
      and os.path.isfile(os.path.join(input_folder, f))
  ]

  # Loop through each PDF file and convert it to text using OCR
  for file in files:
      pdf_path = os.path.join(input_folder, file)
      txt_path = os.path.join(output_folder, os.path.splitext(file)[0] + ".txt")

      # Render every PDF page to an image
      images = convert_from_path(pdf_path)

      # OCR each page and join the results in page order; str.join avoids
      # repeatedly re-copying a growing string.
      text = "".join(
          pytesseract.image_to_string(image, lang=ocr_language)
          for image in images
      )

      # Save the extracted text to a text file
      with open(txt_path, "w", encoding="utf-8") as txt_file:
          txt_file.write(text)

  print("Conversion complete!")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement