# Paris museum scraper

#!/usr/bin/env python3

# Scrapes pictures in public domain from:
# http://parismuseescollections.paris.fr/en/recherche/image-libre/true?solrsort=ds_created%20desc

import threading
import zipfile
import io
import requests
import os
import re


class Get_pictures():
    """
    Fetches the HTML source of one Paris museum search-result page
    and collects the node number of every picture linked from it.

    No duplicate filtering happens here: the numbers are stored exactly
    as they appear in the page, and the caller decides which to skip.

    Attributes:
        source: HTML text of the fetched page.
        picture_number_list: picture node numbers as strings
            (e.g. '83967'), in the order they occur in the page.
    """

    # Matches the numeric tail of picture links such as
    # http://parismuseescollections.paris.fr/en/node/83967
    # Raw string so that \d reaches the regex engine untouched instead of
    # being an invalid string escape (a warning on current Python).
    _NODE_NUMBER_PATTERN = re.compile(r'about="/en/node/(\d+)" ')

    def __init__(self, url):
        """
        Download the page at `url` and extract its picture numbers.

        Any exception raised by `requests.get` propagates to the caller.
        """
        # HTML source of the webpage with pictures
        self.source = requests.get(url).text

        # Keep only the number part at the end of each picture url
        self.picture_number_list = self._NODE_NUMBER_PATTERN.findall(
            self.source)


# No need to download what has been downloaded.
# Make sure the target directory exists first: on a fresh run there is no
# ./Pictures/ yet and os.listdir would raise FileNotFoundError.
os.makedirs('./Pictures/', exist_ok=True)
list_downloaded = os.listdir('./Pictures/')

# The download link also has only the picture number in it,
# so there is no need to look inside each individual picture page
# to find it. All we need are the ending numbers. :D

# There are 2172 pages with urls like this (page index 0 .. 2171).
all_page_urls = [
    ('http://parismuseescollections.paris.fr/en/recherche/image-libre/true'
     '?page={}&solrsort=ds_created%20desc').format(page)
    for page in range(2172)
]


def downloader(list_downloaded, url):
    """
    Gets url and downloads all pictures listed there.

    Args:
        list_downloaded: collection of picture numbers (strings) that
            must be skipped; the script passes the entry names found
            in ./Pictures/.
        url: address of the search-result page to scrape.

    Each new picture is fetched as a zip archive and unpacked into
    ./Pictures/<number>. If fetching or unpacking fails, the picture
    number is appended to ./Errors.txt (one per line) and the loop
    continues with the next picture.
    """
    picture_number_list = Get_pictures(url).picture_number_list

    for number in picture_number_list:
        if number in list_downloaded:
            continue

        # Show progress
        print(number)

        try:
            # Download the zip into memory
            response = requests.get(
                'http://parismuseescollections.paris.fr/en/zip/oeuvre/{}'
                .format(number))
            # Save extracted zip to individual dirs in dir Pictures;
            # the with-block closes the archive even if extraction fails.
            with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
                archive.extractall('./Pictures/{}'.format(number))
        except Exception:
            # Best effort: note the failed picture and carry on.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit through so the scraper can still be stopped.
            with open('./Errors.txt', 'a') as dat:
                dat.write(str(number) + '\n')


# Might work a little faster if we pull more than one page at the same time.
#
# The Thread target must be the function itself plus its arguments;
# writing target=downloader(...) would run the download right here in the
# main thread and hand the thread a target of None.
#
# Pages are processed in small batches so that only a handful of threads
# (and connections to the site) are alive at once, instead of one thread
# per page all started together.
MAX_CONCURRENT_PAGES = 8

list_threads = []
for batch_start in range(0, len(all_page_urls), MAX_CONCURRENT_PAGES):
    batch = [
        threading.Thread(target=downloader, args=(list_downloaded, url))
        for url in all_page_urls[batch_start:batch_start + MAX_CONCURRENT_PAGES]
    ]

    for thread in batch:
        thread.start()

    # Wait for the whole batch before launching the next one.
    for thread in batch:
        thread.join()

    list_threads.extend(batch)