Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os
import time
import urllib.request

import pdf2image
from PIL import Image
def download_lidl1():
    """Download the first flyer PDF and store it at ``lidl/1/lidl1.pdf``.

    The destination directory is created beforehand when it is missing:
    ``urlretrieve`` opens the target file for writing and would otherwise
    raise ``FileNotFoundError`` on a fresh checkout.
    """
    url = 'https://media.lidl-flyer.com/cdce4d18-7327-11e9-a340-005056ab0fb6/GAZETKA-OD-13.05-Oferta-wa%C5%BCna-od-13.05-do-15.05-01.pdf'
    destination = 'lidl/1/lidl1.pdf'
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    urllib.request.urlretrieve(url, destination)
def download_lidl2():
    """Download the second flyer PDF and store it at ``lidl/2/lidl2.pdf``.

    The destination directory is created beforehand when it is missing:
    ``urlretrieve`` opens the target file for writing and would otherwise
    raise ``FileNotFoundError`` on a fresh checkout.
    """
    url = 'https://media.lidl-flyer.com/72e91026-6ff7-11e9-a340-005056ab0fb6/KATALOG-OD-13.05-Oferta-wa%C5%BCna-od-13.05-do-18.05-02.pdf'
    destination = 'lidl/2/lidl2.pdf'
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    urllib.request.urlretrieve(url, destination)
# --- Conversion settings shared by both PDF-to-image functions ---

# Input PDFs, one per flyer.
PDF_PATH_LIDL1 = "lidl/1/lidl1.pdf"
PDF_PATH_LIDL2 = "lidl/2/lidl2.pdf"

# Rendering: image resolution and the output format passed as ``fmt``.
DPI = 200
FORMAT = 'jpg'

# Page range handed to the converter; None leaves both ends unset.
FIRST_PAGE = None
LAST_PAGE = None

# Optional folder for the converter's output (None: not set).
OUTPUT_FOLDER = None

# Number of threads used for the conversion.
THREAD_COUNT = 1

# Password for unlocking the PDF (None: no password supplied).
USERPWD = None

# Use the crop box instead of the media box when converting.
USE_CROPBOX = False

# Ask the converter to report pdftoppm syntax errors as PDFSyntaxError.
STRICT = False
def pdftopil_lidl1():
    """Read the PDF at ``PDF_PATH_LIDL1`` and convert it into a sequence of images.

    Every option is taken from the module-level constants and forwarded to
    ``pdf2image.convert_from_path``:

    * ``dpi`` -- adjusts the resolution of the images.
    * ``output_folder`` -- optional folder in which the images can be stored.
    * ``first_page`` / ``last_page`` -- first and last page processed by
      pdftoppm.
    * ``fmt`` -- format of the pdftoppm conversion.
    * ``thread_count`` -- how many threads are used for the conversion.
    * ``userpw`` -- password to unlock the PDF.
    * ``use_cropbox`` -- use the crop box instead of the media box.
    * ``strict`` -- catch pdftoppm syntax errors as ``PDFSyntaxError``.

    Returns whatever ``convert_from_path`` returns (the converted images).
    """
    conversion_options = {
        "dpi": DPI,
        "output_folder": OUTPUT_FOLDER,
        "first_page": FIRST_PAGE,
        "last_page": LAST_PAGE,
        "fmt": FORMAT,
        "thread_count": THREAD_COUNT,
        "userpw": USERPWD,
        "use_cropbox": USE_CROPBOX,
        "strict": STRICT,
    }
    return pdf2image.convert_from_path(PDF_PATH_LIDL1, **conversion_options)
def pdftopil_lidl2():
    """Read the PDF at ``PDF_PATH_LIDL2`` and convert it into a sequence of images.

    The resolution, page range, format, thread count, password, crop-box and
    strictness options all come from the module-level constants and are
    forwarded unchanged to ``pdf2image.convert_from_path``.

    Returns whatever ``convert_from_path`` returns (the converted images).
    """
    conversion_options = {
        "dpi": DPI,
        "output_folder": OUTPUT_FOLDER,
        "first_page": FIRST_PAGE,
        "last_page": LAST_PAGE,
        "fmt": FORMAT,
        "thread_count": THREAD_COUNT,
        "userpw": USERPWD,
        "use_cropbox": USE_CROPBOX,
        "strict": STRICT,
    }
    return pdf2image.convert_from_path(PDF_PATH_LIDL2, **conversion_options)
def save_images_lidl1(pil_images_lidl1):
    """Save each image as ``lidl/1/page_<n>.jpg``, numbering pages from 1.

    ``pil_images_lidl1`` is iterated once; every item only needs a
    ``save(path)`` method. Nothing is returned.
    """
    for page_number, page in enumerate(pil_images_lidl1, start=1):
        page.save("lidl/1/page_" + str(page_number) + ".jpg")
def save_images_lidl2(pil_images_lidl2):
    """Save each image as ``lidl/2/page_<n>.jpg``, numbering pages from 1.

    After each page is written its number is printed to standard output.
    ``pil_images_lidl2`` is iterated once; every item only needs a
    ``save(path)`` method. Nothing is returned.
    """
    for page_number, page in enumerate(pil_images_lidl2, start=1):
        page.save("lidl/2/page_" + str(page_number) + ".jpg")
        print(page_number)
if __name__ == "__main__":
    # Flyer 1: fetch the PDF, convert its pages, then write them to lidl/1/.
    download_lidl1()
    save_images_lidl1(pdftopil_lidl1())

    # Flyer 2: same three steps, writing to lidl/2/.
    download_lidl2()
    save_images_lidl2(pdftopil_lidl2())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement