Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# --------------------------
# EXPLANATION
# EN https://neculaifantanaru.com/en/convert-pdf-into-txt-using-tesseract-ocr.html
# RO https://neculaifantanaru.com/convert-pdf-into-txt-using-tesseract-ocr.html
# --------------------------
- import platform
- from tempfile import TemporaryDirectory
- from pathlib import Path
- import os
- import pytesseract
- from pdf2image import convert_from_path
- import PIL.Image
# Raise Pillow's MAX_IMAGE_PIXELS limit.
# NOTE(review): presumably so that very large page renders are not rejected
# by Pillow's decompression-bomb check -- confirm the value is still wanted.
PIL.Image.MAX_IMAGE_PIXELS = 933_120_000

# Platform-specific setup: external tool locations and the "home" output folder.
if platform.system() != "Windows":
    out_directory = Path("~").expanduser()
else:
    print("windows")
    # Windows needs a separately installed Tesseract OCR engine:
    # https://github.com/UB-Mannheim/tesseract/wiki/Downloading-Tesseract-OCR-Engine
    # pytesseract is pointed at the executable explicitly.
    pytesseract.pytesseract.tesseract_cmd = (
        r"c:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
    )
    # Windows also needs the poppler binaries:
    # https://anaconda.org/conda-forge/poppler/files
    # To add them to PATH as well: System Properties -> "Environment
    # Variables" -> PATH -> Edit -> New -> the poppler "Library\bin" folder
    # of the installed version (same folder as below).
    path_to_poppler_exe = Path(r"c:\Program Files\poppler-22.04.0\Library\bin")
    # On Windows the user's Desktop is used instead of the bare home folder.
    out_directory = Path(r"~\Desktop").expanduser()

# Show which folder was selected.
print(out_directory)
# Folder in which the converted text files are written
# (relative to the current working directory).
converted_files = Path(r"converted files").expanduser()

# Base path in which all the input files are present
# (relative to the current working directory).
base_path = r"1"

# Names of all entries found directly inside base_path.
subfolders = os.listdir(base_path)
# Main execution point of the program: OCR every PDF found in base_path and
# write the recognized text to "<converted_files>/<pdf name>.txt".

# Make sure the destination folder exists before any output file is opened;
# open() does not create missing parent directories.
converted_files.mkdir(parents=True, exist_ok=True)

for pdf_name in subfolders:
    # Path of the input PDF.
    PDF_file = Path(base_path) / pdf_name

    # Skip sub-directories and anything that is not a PDF; handing such
    # entries to convert_from_path would abort the whole batch.
    if not PDF_file.is_file() or PDF_file.suffix.lower() != ".pdf":
        continue

    # Swap only the final extension ("a.pdf" / "a.PDF" -> "a.txt"), leaving
    # any other ".pdf" occurrence inside the name untouched.
    text_file = converted_files / PDF_file.with_suffix(".txt").name

    # Part #1 - Converting the PDF pages to images.
    # The temporary directory holds the page images and is removed, together
    # with its contents, when the "with" block ends.
    with TemporaryDirectory() as tempdir:
        # Read in the PDF file at 500 DPI.
        if platform.system() == "Windows":
            # On Windows the poppler location is passed explicitly.
            pdf_pages = convert_from_path(
                PDF_file, 500, poppler_path=path_to_poppler_exe
            )
        else:
            pdf_pages = convert_from_path(PDF_file, 500)

        # Paths of the saved page images, in page order.
        image_file_list = []
        # enumerate() "counts" the pages for us, starting at 1.
        for page_enumeration, page in enumerate(pdf_pages, start=1):
            # PDF page 1 -> page_001.jpg, page 2 -> page_002.jpg, ...
            # Build the path with pathlib so the image really lands inside
            # tempdir on every platform (a literal "\" is only a separator
            # on Windows).
            filename = Path(tempdir) / f"page_{page_enumeration:03}.jpg"
            # Save the image of the page in the temporary directory.
            page.save(filename, "JPEG")
            image_file_list.append(filename)

        # Part #2 - Recognizing text from the images using OCR.
        # The file is opened once per PDF, so write mode is enough to collect
        # all pages in it, and re-running the script no longer appends a
        # second copy of the text. UTF-8 keeps the write independent of the
        # platform's default encoding.
        with open(text_file, "w", encoding="utf-8") as text_out:
            for image_file in image_file_list:
                # Recognize the text in the image as a string using
                # pytesseract; the image file is closed right afterwards.
                with PIL.Image.open(image_file) as page_image:
                    text = pytesseract.image_to_string(page_image)

                # In many PDFs a word that does not fit at the end of a line
                # is split with a hyphen and continued on the next line.
                # Join such words again by replacing every '-\n' with ''.
                text = text.replace("-\n", "")
                print(text)

                # Finally, write the processed text to the file.
                text_out.write(text)

    # Reported only once the text file has actually been written.
    print(f"Converted {pdf_name} \n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement