Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
# Download all CDN-hosted page images referenced by a locally saved HTML
# file, bind them into a single A4 PDF, then delete the downloaded images.

from bs4 import BeautifulSoup  # for parsing HTML
import os                      # for managing files
import sys                     # for cleaner stdout
import urllib.request          # for downloading images
from fpdf import FPDF          # for generating PDFs

# page content saved as HTML file
inputfile = 'MUS-17-Tricia-Rose-reading.html'
output = inputfile[:-5] + '.pdf'  # strip the '.html' suffix, append '.pdf'

# sets up BeautifulSoup HTML parser
# (context manager closes the file handle promptly; the original left it
# open until interpreter exit)
with open(inputfile, encoding='utf8') as f:
    body = f.read()
soup = BeautifulSoup(body, 'html.parser')

# retrieves all images linked from anchors pointing at the CDN
directory = 'images/'
images = []
if not os.path.exists(directory):
    os.makedirs(directory)
for link in soup.find_all('a'):
    href = str(link.get('href'))
    if 'cdn.alexanderstreet.com' in href:
        # NOTE(review): filename is carved from fixed positions in the URL
        # (6 chars of page id plus the 4-char extension) — fragile if the
        # CDN's URL layout ever changes; verify against real hrefs.
        filename = 'images/' + href[-24:-18] + href[-4:]
        # '\r' rewrites the same console line for a compact progress display
        sys.stdout.write("\rRetrieving image: " + href)
        sys.stdout.flush()
        urllib.request.urlretrieve(href, filename)
        images.append(filename)
print("\nRetrieved %s images." % len(images))

# writes PDF: one full A4 page (210 x 297 mm) per downloaded image
print()
pdf = FPDF()
for image in images:
    pdf.add_page()
    pdf.image(image, 0, 0, 210, 297)
print("Writing PDF...")
pdf.output(output, 'F')
print("Successfully wrote PDF: " + output)

# cleans up: remove every downloaded image, then the (now empty) directory
for image in images:
    os.remove(image)
os.rmdir(directory)
print("Cleaned up.")
Add Comment
Please sign in to add a comment.