Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- # Scrapes pictures in public domain from:
- # http://parismuseescollections.paris.fr/en/recherche/image-libre/true?solrsort=ds_created%20desc
import io
import os
import re
import threading
import zipfile
from concurrent.futures import ThreadPoolExecutor

import requests
class Get_pictures():
    """
    Fetch one search-result page of the Paris museum collections site
    and collect the picture numbers listed on it.

    Attributes set by the constructor:
        source: HTML text of the fetched page.
        picture_number_list: picture (node) numbers found in the page,
            as strings, in the order they appear. Duplicates are not
            removed here; callers decide what to skip.
    """

    # Picture links look like
    # http://parismuseescollections.paris.fr/en/node/83967
    # and only the number part at the end is needed.
    # Raw string so that \d is a regex digit class, not an (invalid)
    # string escape sequence.
    _NODE_ID_PATTERN = re.compile(r'about="/en/node/(\d+)" ')

    def __init__(self, url, timeout=30):
        """
        Download `url` and extract the picture numbers from its HTML.

        url: address of the page to scan.
        timeout: seconds to wait for the server before requests raises
            a timeout error; without it a stalled connection would
            block forever.
        """
        # HTML source of the webpage with pictures
        self.source = requests.get(url, timeout=timeout).text
        self.picture_number_list = self._NODE_ID_PATTERN.findall(self.source)
# No need to download what has been downloaded: every picture is extracted
# into ./Pictures/<picture number>/, so the existing entry names are the
# numbers that can be skipped.
# Create the directory first, because listing a missing directory raises
# FileNotFoundError on a first run. A set makes each membership test O(1).
os.makedirs('./Pictures/', exist_ok=True)
list_downloaded = set(os.listdir('./Pictures/'))

# Link to download also has only the picture number in it
# so there is no need to check in each individual picture link
# to find it. All we need are the ending numbers.
# There are 2172 pages (numbered from 0) with urls like this.
PAGE_COUNT = 2172
PAGE_URL_TEMPLATE = (
    'http://parismuseescollections.paris.fr/en/recherche/image-libre/true'
    '?page={}&solrsort=ds_created%20desc'
)
all_page_urls = [PAGE_URL_TEMPLATE.format(i) for i in range(PAGE_COUNT)]
def downloader(list_downloaded, url):
    """
    Download and extract every not-yet-downloaded picture listed on a page.

    list_downloaded: collection of picture numbers (strings) that are
        already present and must be skipped.
    url: address of the search-result page to scan.

    Each picture's zip archive is extracted into ./Pictures/<number>/.
    When downloading or extracting a picture fails, its number is
    appended to ./Errors.txt and the loop moves on to the next picture.
    """
    picture_number_list = Get_pictures(url).picture_number_list
    for number in picture_number_list:
        if number in list_downloaded:
            continue
        # Show progress
        print(number)
        try:
            # Download and extract zips
            r = requests.get(
                'http://parismuseescollections.paris.fr/en/zip/oeuvre/{}'
                .format(number),
                # Without a timeout a stalled download would block forever;
                # with it the failure lands in Errors.txt like any other.
                timeout=60)
            # Save extracted zip to individual dirs in dir Pictures
            with zipfile.ZipFile(io.BytesIO(r.content)) as z:
                z.extractall('./Pictures/{}'.format(number))
        except Exception:
            # Best effort: record the failed number and keep going.
            # Exception (rather than a bare except) lets Ctrl-C and
            # SystemExit still stop the script.
            with open('./Errors.txt', 'a') as dat:
                dat.write(str(number) + '\n')
# Might work a little faster if we pull more than one page at the same time.
# A bounded pool keeps the number of simultaneous connections reasonable
# instead of starting one thread per page url all at once.
MAX_WORKERS = 8
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Hand over the callable and its arguments separately: writing
    # downloader(list_downloaded, url) here would run the download right
    # away in the main thread and give the worker nothing to do.
    futures = {
        executor.submit(downloader, list_downloaded, url): url
        for url in all_page_urls
    }
    for future, url in futures.items():
        # exception() waits for the task to finish; report pages whose
        # processing raised so the failure is not silently lost.
        error = future.exception()
        if error is not None:
            print('Failed to process page {}: {!r}'.format(url, error))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement