Advertisement
Guest User

python 3 mgewiki image scraper

a guest
Oct 19th, 2018
175
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.33 KB | None | 0 0
  1. import argparse
  2. import os
  3. import re
  4. import shutil
  5. import time
  6. import urllib.error
  7. import urllib.request
  8.  
  9. def _parse_args():
  10.     parser = argparse.ArgumentParser()
  11.     parser.add_argument('--root', default = os.path.dirname(os.path.abspath(__file__)))
  12.     parser.add_argument('--rate', help = 'min seconds per request', type = float, default = 1.0)
  13.     return parser.parse_args()
  14.    
  15. def wrapped_urlopen(request, f_open = urllib.request.urlopen):
  16.     try:
  17.         return f_open(request)
  18.     except urllib.error.HTTPError as e:
  19.         print('HTTPError', e.code, e.reason)
  20.     except urllib.error.URLError as e2:
  21.         print('URLError', e2.reason)
  22.     return None
  23.    
  24. def wait():
  25.     global clock, rate
  26.     time.sleep(max(0, rate - time.perf_counter() + clock))
  27.     clock = time.perf_counter()
  28.     return None
  29.    
  30. def get(url, headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:10.0) Gecko/20100101 Firefox/10.0'}):
  31.     wait()
  32.     print('Scraping: %s' %url)
  33.     request = urllib.request.Request(url = url, headers = headers)
  34.     return wrapped_urlopen(request)
  35.    
  36. def _main(args):
  37.     global clock, rate
  38.     clock, rate = 0, args.rate
  39.     outdir = os.path.join(args.root, 'images')
  40.     if not os.path.exists(outdir): os.mkdir(outdir)
  41.     base_url = 'http://readonly.mgewiki.com'
  42.     gallery_url = 'http://readonly.mgewiki.com/index.php?title=Special:NewFiles'
  43.     offset = ''
  44.     while gallery_url != None:
  45.         gallery_dat = get(gallery_url).read().decode('utf-8')
  46.         tmp_offset = re.findall('href="/index.php\?title=Special:NewFiles&offset=([^&]*?)"', gallery_dat, re.DOTALL)[0]
  47.         gallery_url = None if tmp_offset == offset else '%s/index.php?title=Special:NewFiles&offset=%s' %(base_url, tmp_offset)
  48.         offset = tmp_offset
  49.         for url in ['%s/w/File:%s' %(base_url, file) for file in re.findall('href="/w/File:(.*?)"', gallery_dat, re.DOTALL)]:
  50.             image_path = os.path.join(outdir, url.split(':')[-1])
  51.             if os.path.exists(image_path): continue
  52.             image_url = '%s/images/%s' %(base_url, re.findall('href="/images/(?!thumb)(.*?)"', get(url).read().decode('utf-8'), re.DOTALL)[0])
  53.             if os.path.exists(os.path.join(outdir, image_url.split('/')[-1])): continue
  54.             with get(image_url) as response, open(os.path.join(outdir, image_url.split('/')[-1]), 'wb') as f: shutil.copyfileobj(response, f)
  55.     return None
  56.    
  57. if __name__ == '__main__':
  58.     args = _parse_args()
  59.     try:
  60.         _main(args)
  61.     except KeyboardInterrupt:
  62.         pass
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement