Guest User

Untitled

a guest
Dec 13th, 2017
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.30 KB | None | 0 0
  1. from StringIO import StringIO
  2.  
  3. from pdfminer.pdfparser import PDFParser
  4. from pdfminer.pdfdocument import PDFDocument
  5. from pdfminer.pdfpage import PDFPage
  6. from pdfminer.pdfinterp import PDFResourceManager
  7. from pdfminer.pdfinterp import PDFPageInterpreter as PI
  8. from pdfminer.pdfdevice import PDFDevice
  9. from pdfminer.converter import TextConverter
  10.  
  11. import utils
  12.  
  13. __all__ = ['PDF']
  14.  
  15. class PDFPageInterpreter(PI):
  16. def process_page(self, page):
  17. if 1 <= self.debug:
  18. print >>stderr, 'Processing page: %r' % page
  19. (x0,y0,x1,y1) = page.mediabox
  20. if page.rotate == 90:
  21. ctm = (0,-1,1,0, -y0,x1)
  22. elif page.rotate == 180:
  23. ctm = (-1,0,0,-1, x1,y1)
  24. elif page.rotate == 270:
  25. ctm = (0,1,-1,0, y1,-x0)
  26. else:
  27. ctm = (1,0,0,1, -x0,-y0)
  28. self.device.outfp.seek(0)
  29. self.device.outfp.buf = ''
  30. self.device.begin_page(page, ctm)
  31. self.render_contents(page.resources, page.contents, ctm=ctm)
  32. self.device.end_page(page)
  33. return self.device.outfp.getvalue()
  34.  
  35. class PDF(list):
  36. def __init__(self, file, password='', just_text=1):
  37. self.parser = PDFParser(file)
  38. self.doc = PDFDocument(self.parser)
  39. if self.doc.is_extractable:
  40. self.resmgr = PDFResourceManager()
  41. self.device = TextConverter(self.resmgr, outfp=StringIO())
  42. self.interpreter = PDFPageInterpreter(
  43. self.resmgr, self.device)
  44. for page in PDFPage.create_pages(self.doc):
  45. self.append(self.interpreter.process_page(page))
  46. self.metadata = self.doc.info
  47. if just_text:
  48. self._cleanup()
  49.  
  50. def _cleanup(self):
  51. """
  52. Frees lots of non-textual information, such as the fonts
  53. and images and the objects that were needed to parse the
  54. PDF.
  55. """
  56. del self.device
  57. del self.doc
  58. del self.parser
  59. del self.resmgr
  60. del self.interpreter
  61.  
  62. def text(self, clean=True):
  63. """
  64. Returns the text of the PDF as a single string.
  65. Options:
  66.  
  67. :clean:
  68. Removes misc cruft, like lots of whitespace.
  69. """
  70. if clean:
  71. return ''.join(utils.trim_whitespace(page) for page in self)
  72. else:
  73. return ''.join(self)
Add Comment
Please, Sign In to add comment