Advertisement
Guest User

pdf2txt

a guest
Mar 21st, 2020
173
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.07 KB | None | 0 0
  1. import io
  2. import sys
  3. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  4. from pdfminer.converter import TextConverter
  5. from pdfminer.layout import LAParams
  6. from pdfminer.pdfpage import PDFPage
  7.  
  8.  
  9. def convert_pdf_to_txt(path):
  10.     rsrcmgr = PDFResourceManager()
  11.     retstr = io.StringIO()
  12.     codec = 'utf-8'
  13.     laparams = LAParams()
  14.     device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
  15.     fp = open(path, 'rb')
  16.     interpreter = PDFPageInterpreter(rsrcmgr, device)
  17.     password = ""
  18.     maxpages = 0
  19.     caching = True
  20.     pagenos = set()
  21.  
  22.     for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
  23.                                   password=password,
  24.                                   caching=caching,
  25.                                   check_extractable=True):
  26.         interpreter.process_page(page)
  27.  
  28.     text = retstr.getvalue()
  29.  
  30.     fp.close()
  31.     device.close()
  32.     retstr.close()
  33.     print(text)
  34.     input('press enter')
  35.     return text
  36.  
  37.  
  38. if __name__ == "__main__":
  39.     convert_pdf_to_txt(sys.argv[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement