Advertisement
nicuf

convert pdf into txt using tesseract ocr

Oct 9th, 2022 (edited)
1,670
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.09 KB | None | 0 0
  1. --------------------------
  2. # EXPLANATION
  3. # EN  https://neculaifantanaru.com/en/convert-pdf-into-txt-using-tesseract-ocr.html
  4. # RO  https://neculaifantanaru.com/convert-pdf-into-txt-using-tesseract-ocr.html
  5. # --------------------------
  6.  
  7.  
  8. import platform
  9. from tempfile import TemporaryDirectory
  10. from pathlib import Path
  11. import os
  12.  
  13. import pytesseract
  14. from pdf2image import convert_from_path
  15. import PIL.Image
  16. PIL.Image.MAX_IMAGE_PIXELS = 933120000
  17.  
  18. if platform.system() == "Windows":
  19.     print("windows")
  20.     # We may need to do some additional downloading and setup...
  21.     # Windows needs a PyTesseract Download
  22.     # https://github.com/UB-Mannheim/tesseract/wiki/Downloading-Tesseract-OCR-Engine
  23.  
  24.     pytesseract.pytesseract.tesseract_cmd = (
  25.         r"c:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
  26.     )
  27.  
  28.     # Windows also needs poppler_exe
  29.     # https://anaconda.org/conda-forge/poppler/files
  30.     # Go to System Properties window (search on windows) -> click “Environment Variables.” -> PATH -> Edit -> New -> c:\Program Files\poppler-22.04.0\Library\bin   (put there the version of poppler with bin)
  31.     path_to_poppler_exe = Path(r"c:\Program Files\poppler-22.04.0\Library\bin")
  32.      
  33.     # Put our output files in a sane place...
  34.     out_directory = Path(r"~\Desktop").expanduser()
  35. else:
  36.     out_directory = Path("~").expanduser()
  37.     print(out_directory)  
  38.  
  39. # base psth in which all the xml files is present
  40. base_path = r"1"
  41. subfolders = []
  42. converted_files = Path(r"converted files").expanduser()
  43.  
  44. # getting names of all xml files
  45. for file in os.listdir(base_path):
  46.   subfolders.append(file)
  47.  
  48. for i in range(len(subfolders)):
  49.     print(f"Converted {subfolders[i]} \n")
  50.     input_file = base_path+"/"+subfolders[i]
  51.     # Path of the Input pdf
  52.     PDF_file = Path(input_file)
  53.     output_file = subfolders[i].replace(".pdf",".txt")
  54.     # Store all the pages of the PDF in a variable
  55.     image_file_list = []
  56.    
  57.     text_file = converted_files / Path(output_file)
  58.    
  59.     ''' Main execution point of the program'''
  60.     with TemporaryDirectory() as tempdir:
  61.         # Create a temporary directory to hold our temporary images.
  62.         if platform.system() == "Windows":
  63.             # print("Windows")
  64.             pdf_pages = convert_from_path(PDF_file, 500, poppler_path=path_to_poppler_exe)
  65.         else:
  66.             pdf_pages = convert_from_path(PDF_file, 500)
  67.         # Read in the PDF file at 500 DPI
  68.    
  69.         # Iterate through all the pages stored above
  70.         for page_enumeration, page in enumerate(pdf_pages, start=1):
  71.             # enumerate() "counts" the pages for us.
  72.    
  73.             # Create a file name to store the image
  74.             filename = f"{tempdir}\page_{page_enumeration:03}.jpg"
  75.  
  76.             # Declaring filename for each page of PDF as JPG
  77.                 # For each page, filename will be:
  78.                 # PDF page 1 -> page_001.jpg
  79.                 # PDF page 2 -> page_002.jpg
  80.                 # PDF page 3 -> page_003.jpg
  81.                 # ....
  82.                 # PDF page n -> page_00n.jpg
  83.    
  84.                 # Save the image of the page in system
  85.             page.save(filename, "JPEG")
  86.             image_file_list.append(filename)
  87.    
  88.             """
  89.            Part #2 - Recognizing text from the images using OCR
  90.            """
  91.    
  92.         with open(text_file, "a") as output_file:
  93.                 # Open the file in append mode so that
  94.                 # All contents of all images are added to the same file
  95.    
  96.                 # Iterate from 1 to total number of pages
  97.             for image_file in image_file_list:
  98.    
  99.                     # Set filename to recognize text from
  100.                     # Again, these files will be:
  101.                     # page_1.jpg
  102.                     # page_2.jpg
  103.                     # ....
  104.                     # page_n.jpg
  105.    
  106.                     # Recognize the text as string in image using pytesserct
  107.                 text = str(((pytesseract.image_to_string(PIL.Image.open(image_file)))))
  108.    
  109.                     # The recognized text is stored in variable text
  110.                     # Any string processing may be applied on text
  111.                     # Here, basic formatting has been done:
  112.                     # In many PDFs, at line ending, if a word can't
  113.                     # be written fully, a 'hyphen' is added.
  114.                     # The rest of the word is written in the next line
  115.                     # Eg: This is a sample text this word here GeeksF-
  116.                     # orGeeks is half on first line, remaining on next.
  117.                     # To remove this, we replace every '-\n' to ''.
  118.                 text = text.replace("-\n", "")
  119.                 print(text)
  120.                     # Finally, write the processed text to the file.
  121.                 output_file.write(text)
  122.    
  123.                 # At the end of the with .. output_file block
  124.                 # the file is closed after writing all the text.
  125.             # At the end of the with .. tempdir block, the
  126.             # TemporaryDirectory() we're using gets removed!      
  127.         # End of main function!
  128.      
  129.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement