Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import argparse
- import os
- import re
- import shutil
- import time
- import urllib.error
- import urllib.request
def _parse_args():
    """Build the command-line parser and return the parsed arguments.

    Options:
        --root: defaults to the directory that contains this script.
        --rate: float, minimum seconds per request; defaults to 1.0.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    cli = argparse.ArgumentParser()
    cli.add_argument('--root', default=script_dir)
    cli.add_argument('--rate', type=float, default=1.0, help='min seconds per request')
    return cli.parse_args()
def wrapped_urlopen(request, f_open=urllib.request.urlopen):
    """Open *request* with *f_open*, reporting URL errors instead of raising.

    Args:
        request: Whatever *f_open* accepts (a URL or a ``Request`` object for
            the default opener).
        f_open: Callable used to open the request; defaults to
            ``urllib.request.urlopen``.

    Returns:
        The value returned by *f_open*, or ``None`` if it raised ``HTTPError``
        or ``URLError`` (the error is printed to stdout in that case).
    """
    response = None
    try:
        response = f_open(request)
    # HTTPError is a subclass of URLError, so it has to be handled first.
    except urllib.error.HTTPError as http_err:
        print('HTTPError', http_err.code, http_err.reason)
    except urllib.error.URLError as url_err:
        print('URLError', url_err.reason)
    return response
def wait():
    """Block until at least ``rate`` seconds have passed since the last call.

    Reads the module globals ``clock`` (``time.perf_counter()`` value recorded
    by the previous call) and ``rate`` (minimum seconds between calls), sleeps
    for whatever part of the interval is still outstanding, then stores the
    current ``time.perf_counter()`` value back into ``clock``.
    """
    global clock
    delay = rate - time.perf_counter() + clock
    # Always call sleep (with 0 when nothing is outstanding), never a negative value.
    time.sleep(delay if delay > 0 else 0)
    clock = time.perf_counter()
    return None
def get(url, headers=None):
    """Issue a rate-limited GET request for *url*.

    Calls ``wait()`` first so successive requests respect the configured rate,
    prints the URL being fetched, then opens it through ``wrapped_urlopen``.

    Args:
        url: The URL to fetch.
        headers: Optional mapping of request headers. When omitted (or
            ``None``) a Firefox-on-Windows ``User-Agent`` header is sent.

    Returns:
        The response object from ``wrapped_urlopen``, or ``None`` when the
        request failed with an HTTP/URL error.
    """
    if headers is None:
        # Build the default per call instead of sharing one mutable dict
        # object between every invocation.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:10.0) Gecko/20100101 Firefox/10.0'}
    wait()
    print('Scraping: %s' % url)
    request = urllib.request.Request(url=url, headers=headers)
    return wrapped_urlopen(request)
def _fetch_text(url):
    """Return the UTF-8 decoded body of *url*, or None when the request failed."""
    response = get(url)
    if response is None:
        return None
    with response:  # close the connection once the body has been read
        return response.read().decode('utf-8')


def _download(url, target):
    """Stream *url* into the file *target*; return True on success, False if the request failed."""
    response = get(url)
    if response is None:
        return False
    # Write to a temporary name first: an interrupted transfer must not leave a
    # truncated file at *target*, because existing files are skipped on later runs.
    partial = target + '.part'
    try:
        with response, open(partial, 'wb') as f:
            shutil.copyfileobj(response, f)
    except BaseException:
        if os.path.exists(partial):
            os.remove(partial)
        raise
    os.replace(partial, target)
    return True


def _main(args):
    """Download every image listed on the wiki's Special:NewFiles gallery.

    Walks the gallery page by page (following the ``offset`` link found on
    each page), opens each ``File:`` page to find the full-size image link,
    and saves images that are not already present into ``<args.root>/images``.

    Args:
        args: Object with a ``root`` attribute (base output directory) and a
            ``rate`` attribute (minimum seconds between requests).

    Side effects:
        Initialises the module globals ``clock`` and ``rate`` used by
        ``wait()``, creates the output directory, writes image files.
    """
    global clock, rate
    clock, rate = 0, args.rate

    outdir = os.path.join(args.root, 'images')
    os.makedirs(outdir, exist_ok=True)

    base_url = 'http://readonly.mgewiki.com'
    gallery_url = 'http://readonly.mgewiki.com/index.php?title=Special:NewFiles'
    offset = ''
    while gallery_url is not None:
        gallery_dat = _fetch_text(gallery_url)
        if gallery_dat is None:
            # Without the listing page there is no way to find the next page
            # (the error itself was already printed by wrapped_urlopen).
            break

        offsets = re.findall(r'href="/index.php\?title=Special:NewFiles&offset=([^&]*?)"', gallery_dat, re.DOTALL)
        # No offset link at all means there is no further page: keep the
        # current offset so the loop ends after this page is processed.
        tmp_offset = offsets[0] if offsets else offset
        # An unchanged offset means the gallery has no further page.
        gallery_url = None if tmp_offset == offset else '%s/index.php?title=Special:NewFiles&offset=%s' % (base_url, tmp_offset)
        offset = tmp_offset

        for name in re.findall(r'href="/w/File:(.*?)"', gallery_dat, re.DOTALL):
            url = '%s/w/File:%s' % (base_url, name)
            image_path = os.path.join(outdir, url.split(':')[-1])
            if os.path.exists(image_path):
                continue

            file_page = _fetch_text(url)
            if file_page is None:
                continue  # file page unavailable; move on to the next one
            images = re.findall(r'href="/images/(?!thumb)(.*?)"', file_page, re.DOTALL)
            if not images:
                continue  # no full-size image link on this page

            image_url = '%s/images/%s' % (base_url, images[0])
            target = os.path.join(outdir, image_url.split('/')[-1])
            if os.path.exists(target):
                continue
            _download(image_url, target)
    return None
if __name__ == '__main__':
    # Parse outside the try block so argparse errors / --help exit before any
    # scraping starts.
    args = _parse_args()
    try:
        _main(args)
    except KeyboardInterrupt:
        # Ctrl-C ends the scrape quietly, without a traceback.
        pass
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement