Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# NOTE(review): orphaned fragment. It repeats verbatim the extractability
# check that appears inside parsePDF further down. At this position nothing
# in the visible file has assigned `document` or imported
# `PDFTextExtractionNotAllowed`, and the `raise` line carries no indentation
# under the `if`. Presumably a paste artifact -- confirm before relying on it.
- if not document.is_extractable:
- raise PDFTextExtractionNotAllowed
- import sys
- import getopt
- import urllib2
- import datetime
- import re
- from pdfminer.pdfparser import PDFParser
- from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, PDFConverter, LTContainer, LTText, LTTextBox, LTImage
- from pdfminer.layout import LAParams
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
- from urllib2 import Request
# Define a PDF parser function
def parsePDF(url):
    """Download the PDF at *url* and return the text extracted from it.

    The document is fetched fully into memory, parsed with pdfminer and
    rendered page by page through a ``TextConverter``.

    Args:
        url: Address of the PDF document to download.

    Returns:
        The text the converter wrote for all pages (written with the
        ``'utf-8'`` codec).

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Imported locally because the file-level imports do not provide these
    # names (this snippet targets Python 2: urllib2 / StringIO).
    from contextlib import closing
    from StringIO import StringIO

    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed

    # Open the url provided as an argument to the function and read the
    # content; closing() guarantees the HTTP response is released.
    with closing(urllib2.urlopen(Request(url))) as response:
        pdf_bytes = response.read()

    # Wrap the downloaded bytes in a seekable, file-like object
    memory_file = StringIO(pdf_bytes)
    retstr = StringIO()
    try:
        # Create a PDF parser object associated with the StringIO object
        parser = PDFParser(memory_file)
        # Create a PDF document object that stores the document structure
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed(
                'Text extraction is not allowed: %s' % url)

        # Define parameters to the PDF device object
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        codec = 'utf-8'
        # Create a PDF device object. TextConverter is the device that
        # accepts an output stream, a codec and layout parameters.
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            # Create a PDF interpreter object
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process each page contained in the document
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
        finally:
            device.close()
        return retstr.getvalue()
    finally:
        memory_file.close()
        retstr.close()
# URL of the PDF document
url = "http://www.city.pittsburgh.pa.us/police/blotter/blotter_monday.pdf"
- import urllib2
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
- from pdfminer.converter import TextConverter
- from pdfminer.layout import LAParams
- from pdfminer.pdfpage import PDFPage
- from cStringIO import StringIO
def pdf_from_url_to_txt(url):
    """Download the PDF at *url* and return its text (Python 2 variant).

    Args:
        url: Address of the PDF document to download.

    Returns:
        The text the ``TextConverter`` wrote for all pages (written with the
        ``'utf-8'`` codec).
    """
    # urllib2 responses are not context managers, hence closing().
    from contextlib import closing

    # Open the url provided as an argument to the function and read the
    # content, making sure the HTTP response is released afterwards.
    with closing(urllib2.urlopen(urllib2.Request(url))) as response:
        pdf_bytes = response.read()

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Wrap the downloaded bytes in a file-like object for pdfminer
    fp = StringIO(pdf_bytes)
    try:
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = PDFPage.get_pages(
                fp,
                set(),  # empty page-number set, as in the original call
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        finally:
            # Runs even when parsing fails, so nothing is left open.
            fp.close()
            device.close()
        # Read the buffer before it is closed below.
        return retstr.getvalue()
    finally:
        retstr.close()
def pdf_from_url_to_txt(url):
    """Download the PDF at *url* and return its text (Python 3 variant).

    Args:
        url: Address of the PDF document to download.

    Returns:
        The text the ``TextConverter`` wrote into the in-memory buffer for
        all pages.
    """
    # Imported locally: the visible file-level imports only provide the
    # Python 2 names (urllib2, cStringIO), not the ones used here.
    import urllib.request
    from io import BytesIO, StringIO

    # Download the whole document; the response is closed on exit.
    with urllib.request.urlopen(url) as response:
        pdf_bytes = response.read()

    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    laparams = LAParams()
    with BytesIO(pdf_bytes) as fp, StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = PDFPage.get_pages(
                fp,
                set(),  # empty page-number set, as in the original call
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        finally:
            # Runs even when parsing fails.
            device.close()
        # Read the buffer before the with-block closes it.
        return retstr.getvalue()
Add Comment
Please, Sign In to add comment