Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
# Download all CDN-hosted page images referenced by a locally saved HTML
# file, bind them into a single A4 PDF, then delete the downloaded images.

from bs4 import BeautifulSoup  # for parsing HTML
import os                      # for managing files
import sys                     # for cleaner stdout
import urllib.request          # for downloading images
from fpdf import FPDF          # for generating PDFs

# page content saved as HTML file
inputfile = 'MUS-17-Tricia-Rose-reading.html'
output = inputfile[:-5] + '.pdf'  # strip the '.html' suffix, append '.pdf'

# sets up BeautifulSoup HTML parser
# (context manager closes the file handle promptly; the original left it
# open until interpreter exit)
with open(inputfile, encoding='utf8') as f:
    body = f.read()
soup = BeautifulSoup(body, 'html.parser')

# retrieves all images linked from anchors pointing at the CDN
directory = 'images/'
images = []
if not os.path.exists(directory):
    os.makedirs(directory)
for link in soup.find_all('a'):
    href = str(link.get('href'))
    if 'cdn.alexanderstreet.com' in href:
        # NOTE(review): filename is carved from fixed positions in the URL
        # (6 chars of page id plus the 4-char extension) — fragile if the
        # CDN's URL layout ever changes; verify against real hrefs.
        filename = 'images/' + href[-24:-18] + href[-4:]
        # '\r' rewrites the same console line for a compact progress display
        sys.stdout.write("\rRetrieving image: " + href)
        sys.stdout.flush()
        urllib.request.urlretrieve(href, filename)
        images.append(filename)
print("\nRetrieved %s images." % len(images))

# writes PDF: one full A4 page (210 x 297 mm) per downloaded image
print()
pdf = FPDF()
for image in images:
    pdf.add_page()
    pdf.image(image, 0, 0, 210, 297)
print("Writing PDF...")
pdf.output(output, 'F')
print("Successfully wrote PDF: " + output)

# cleans up: remove every downloaded image, then the (now empty) directory
for image in images:
    os.remove(image)
os.rmdir(directory)
print("Cleaned up.")
Add Comment
Please sign in to add a comment.