Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import io
- from pdfminer.converter import TextConverter
- from pdfminer.pdfinterp import PDFPageInterpreter
- from pdfminer.pdfinterp import PDFResourceManager
- from pdfminer.pdfpage import PDFPage
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file with pdfminer.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text collected from all pages as a single string, or ``None``
        when no text was extracted.
    """
    # Fallback page rectangle (US Letter, in points) for pages without a
    # usable MediaBox.
    # NOTE(review): the default size is an assumption; confirm it suits
    # the documents being processed.
    default_mediabox = (0, 0, 612, 792)

    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                if page.mediabox is None:
                    # process_page() unpacks page.mediabox into four
                    # values and raises TypeError when it is None, so
                    # substitute the crop box (if any) or the default.
                    page.mediabox = (getattr(page, 'cropbox', None)
                                     or default_mediabox)
                page_interpreter.process_page(page)

        text = fake_file_handle.getvalue()
    finally:
        # Close open handles even when a page fails to process.
        converter.close()
        fake_file_handle.close()

    # Keep the original contract: empty output is reported as None.
    return text or None
if __name__ == '__main__':
    # Script entry point: extract the text of 'test.pdf' and print it.
    extracted = extract_text_from_pdf('test.pdf')
    print(extracted)
- ---------------------------------------------------------------------------
- TypeError Traceback (most recent call last)
- <ipython-input-7-cbf464387547> in <module>
- 28
- 29 if __name__ == '__main__':
- ---> 30 print(extract_text_from_pdf('test.pdf'))
- <ipython-input-7-cbf464387547> in extract_text_from_pdf(pdf_path)
- 16 caching=True,
- 17 check_extractable=True):
- ---> 18 page_interpreter.process_page(page)
- 19
- 20 text = fake_file_handle.getvalue()
- ~/anaconda3/lib/python3.7/site-packages/pdfminer/pdfinterp.py in process_page(self, page)
- 840 def process_page(self, page):
- 841 log.info('Processing page: %r', page)
- --> 842 (x0, y0, x1, y1) = page.mediabox
- 843 if page.rotate == 90:
- 844 ctm = (0, -1, 1, 0, -y0, x1)
- TypeError: cannot unpack non-iterable NoneType object
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement