Guest User

Untitled

a guest
Nov 17th, 2018
66
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.64 KB | None | 0 0
  1. if not document.is_extractable:
  2. raise PDFTextExtractionNotAllowed
  3.  
  4. import sys
  5. import getopt
  6. import urllib2
  7. import datetime
  8. import re
  9. from pdfminer.pdfparser import PDFParser
  10. from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, PDFConverter, LTContainer, LTText, LTTextBox, LTImage
  11. from pdfminer.layout import LAParams
  12. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
  13. from urllib2 import Request
  14.  
  15.  
  16. # Define a PDF parser function
  17. def parsePDF(url):
  18.  
  19. # Open the url provided as an argument to the function and read the content
  20. open = urllib2.urlopen(Request(url)).read()
  21.  
  22. # Cast to StringIO object
  23. from StringIO import StringIO
  24. memory_file = StringIO(open)
  25.  
  26. # Create a PDF parser object associated with the StringIO object
  27. parser = PDFParser(memory_file)
  28.  
  29. # Create a PDF document object that stores the document structure
  30. document = PDFDocument(parser)
  31.  
  32. # Check if the document allows text extraction. If not, abort.
  33. if not document.is_extractable:
  34. raise PDFTextExtractionNotAllowed
  35.  
  36. # Define parameters to the PDF device objet
  37. rsrcmgr = PDFResourceManager()
  38. retstr = StringIO()
  39. laparams = LAParams()
  40. codec = 'utf-8'
  41.  
  42. Create a PDF device object
  43. device = PDFDevice(rsrcmgr, retstr, codec = codec, laparams = laparams)
  44. # Create a PDF interpreter object
  45. interpreter = PDFPageInterpreter(rsrcmgr, device)
  46.  
  47. # Process each page contained in the document
  48. for page in PDFPage.create_pages(document):
  49. interpreter.process_page(page)
  50.  
  51. # Construct the url
  52. url = 'http://www.city.pittsburgh.pa.us/police/blotter/blotter_monday.pdf'
  53.  
  54. import urllib2
  55. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  56. from pdfminer.converter import TextConverter
  57. from pdfminer.layout import LAParams
  58. from pdfminer.pdfpage import PDFPage
  59. from cStringIO import StringIO
  60.  
  61.  
  62. def pdf_from_url_to_txt(url):
  63. rsrcmgr = PDFResourceManager()
  64. retstr = StringIO()
  65. codec = 'utf-8'
  66. laparams = LAParams()
  67. device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
  68. # Open the url provided as an argument to the function and read the content
  69. f = urllib2.urlopen(urllib2.Request(url)).read()
  70. # Cast to StringIO object
  71. fp = StringIO(f)
  72. interpreter = PDFPageInterpreter(rsrcmgr, device)
  73. password = ""
  74. maxpages = 0
  75. caching = True
  76. pagenos = set()
  77. for page in PDFPage.get_pages(fp,
  78. pagenos,
  79. maxpages=maxpages,
  80. password=password,
  81. caching=caching,
  82. check_extractable=True):
  83. interpreter.process_page(page)
  84. fp.close()
  85. device.close()
  86. str = retstr.getvalue()
  87. retstr.close()
  88. return str
  89.  
  90. def pdf_from_url_to_txt(url):
  91. rsrcmgr = PDFResourceManager()
  92. retstr = StringIO()
  93. codec = 'utf-8'
  94. laparams = LAParams()
  95. device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
  96. f = urllib.request.urlopen(url).read()
  97. fp = BytesIO(f)
  98. interpreter = PDFPageInterpreter(rsrcmgr, device)
  99. password = ""
  100. maxpages = 0
  101. caching = True
  102. pagenos = set()
  103. for page in PDFPage.get_pages(fp,
  104. pagenos,
  105. maxpages=maxpages,
  106. password=password,
  107. caching=caching,
  108. check_extractable=True):
  109. interpreter.process_page(page)
  110. fp.close()
  111. device.close()
  112. str = retstr.getvalue()
  113. retstr.close()
  114. return str
Add Comment
Please, Sign In to add comment