Advertisement
Guest User

paris museum scraper

a guest
Feb 20th, 2020
251
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.31 KB | None | 0 0
  1. #!/usr/bin/env python3
  2.  
  3. # Scrapes pictures in public domain from:
  4. # http://parismuseescollections.paris.fr/en/recherche/image-libre/true?solrsort=ds_created%20desc
  5.  
  6. import threading
  7. import zipfile
  8. import io
  9. import requests
  10. import os
  11. import re
  12.  
  13.  
  14. class Get_pictures():
  15.     """
  16.    Gets source of the paris museum
  17.    and scrapes every picture it can
  18.  
  19.    Checks for duplicates.
  20.    """
  21.  
  22.     def __init__(self, url):
  23.  
  24.         # HTML source of the webpage with pictures no. 1
  25.         self.source = requests.get(url).text
  26.  
  27.         # Filter out urls of pics. Only the number part at the end:
  28.         # http://parismuseescollections.paris.fr/en/node/83967
  29.         self.picture_number_list = re.compile(
  30.             'about="/en/node/(\d+)" ').findall(self.source)
  31.  
  32.  
  33. # No need to download what has been downloaded.
  34. list_downloaded = os.listdir('./Pictures/')
  35.  
  36. # Link to download also has only the picture number in it
  37. # so there is no need to check in each individial picture link
  38. # to find it. All we need are the ending numbers. :D
  39.  
  40. # There are 2172 pages with urls like this.
  41. all_page_urls = ['http://parismuseescollections.paris.fr/en/recherche/image-libre/true?page={}&solrsort=ds_created%20desc'.format(i) for i in range(0, 2172)]
  42.  
  43.  
  44. def downloader(list_downloaded, url):
  45.     """
  46.    Gets url and downloads all pictures there.
  47.    """
  48.     picture_number_list = Get_pictures(url).picture_number_list
  49.  
  50.     for i in picture_number_list:
  51.         if i not in list_downloaded:
  52.             # Show progress
  53.             print(i)
  54.  
  55.             try:
  56.                 # Download and extract zips
  57.                 r = requests.get(
  58.                     'http://parismuseescollections.paris.fr/en/zip/oeuvre/{}'
  59.                     .format(i))
  60.                 z = zipfile.ZipFile(io.BytesIO(r.content))
  61.                 # Save extracted zip to individual dirs in dir Pictures
  62.                 z.extractall('./Pictures/{}'.format(i))
  63.             except:
  64.                 with open('./Errors.txt', 'a') as dat:
  65.                     dat.write(str(i) + '\n')
  66.  
  67.  
  68. # Might work a little faster if we pull more than one picture at the same time
  69. list_threads = []
  70. for url in all_page_urls:
  71.     list_threads.append(
  72.         threading.Thread(target=downloader(list_downloaded, url)))
  73.  
  74. for thread in list_threads:
  75.     thread.start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement