Guest User

Untitled

a guest
Oct 19th, 2017
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.23 KB | None | 0 0
  1. from bs4 import BeautifulSoup # for parsing HTML
  2. import os # for managing files
  3. import sys # for cleaner stdout
  4. import urllib.request # for downloading images
  5. from fpdf import FPDF # for generating PDFs
  6.  
  7. # page content saved as HTML file
  8. inputfile = 'MUS-17-Tricia-Rose-reading.html'
  9. output = inputfile[:-5] + '.pdf'
  10.  
  11. # sets up BeautifulSoup HTML parser
  12. body = open(inputfile, encoding='utf8').read()
  13. soup = BeautifulSoup(body, 'html.parser')
  14.  
  15. # retrieves all images
  16. directory = 'images/'
  17. images = []
  18. if not os.path.exists(directory):
  19. os.makedirs(directory)
  20. for link in soup.find_all('a'):
  21. href = str(link.get('href'))
  22. if 'cdn.alexanderstreet.com' in href:
  23. filename = 'images/' + href[-24:-18] + href[-4:]
  24. sys.stdout.write("\rRetrieving image: " + href)
  25. sys.stdout.flush()
  26. urllib.request.urlretrieve(href, filename)
  27. images.append(filename)
  28. print("\nRetrieved %s images." % len(images))
  29.  
  30. # writes PDF
  31. print()
  32. pdf = FPDF()
  33. for image in images:
  34. pdf.add_page()
  35. pdf.image(image, 0, 0, 210, 297)
  36. print("Writing PDF...")
  37. pdf.output(output, 'F')
  38. print("Successfully wrote PDF: " + output)
  39.  
  40. # cleans up
  41. for image in images:
  42. os.remove(image)
  43. os.rmdir(directory)
  44. print("Cleaned up.")
Add Comment
Please, Sign In to add comment