Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import io
- import sys
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
- from pdfminer.converter import TextConverter
- from pdfminer.layout import LAParams
- from pdfminer.pdfpage import PDFPage
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file, print it, and return it.

    Every page of the document is run through a pdfminer ``TextConverter``
    that writes into an in-memory text buffer.  The collected text is
    printed to stdout, the function then waits for the user to press
    Enter, and finally the text is returned.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).

    Returns:
        The extracted text as a single ``str``.

    Raises:
        OSError: If ``path`` cannot be opened.
        Any exception raised by pdfminer while parsing the document
        propagates unchanged; the file, converter and buffer are still
        closed in that case.
    """
    rsrcmgr = PDFResourceManager()

    # The context managers guarantee that the buffer and the PDF file are
    # closed even when parsing fails part-way through the document.
    with io.StringIO() as retstr, open(path, 'rb') as fp:
        device = TextConverter(
            rsrcmgr, retstr, codec='utf-8', laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set, maxpages=0 and an empty password are the
            # values the original script passed; per the pdfminer docs
            # they select every page of an unencrypted document.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
            # Read the buffer before the converter and buffer are closed.
            text = retstr.getvalue()
        finally:
            device.close()

    print(text)
    try:
        # Pause so the output stays visible when run in a transient console.
        input('press enter')
    except EOFError:
        # Non-interactive stdin (pipe, closed handle): nothing to wait for,
        # and the extracted text should still be returned to the caller.
        pass
    return text
if __name__ == "__main__":
    # Script entry point: expects the PDF path as the first CLI argument.
    # Exit with a usage message instead of an IndexError traceback when
    # the argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <path-to-pdf>".format(sys.argv[0]))
    convert_pdf_to_txt(sys.argv[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement