Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- import json
- import logging
- import os
- import re
- import urllib
- import urllib2
- from subprocess import call
# Log everything (DEBUG and up) as "LEVEL:message".
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(message)s')
logger = logging.getLogger(__name__)

# Landing page whose HTML carries the theme list in a data attribute.
MAIN_URL = 'https://www.lego.com/en-us/service/buildinginstructions/'

# Paged search endpoint; {idx} is the start offset, {theme} the theme key.
THEME_URL = (
    'https://www.lego.com//service/biservice/searchbytheme'
    '?fromIndex={idx}&onlyAlternatives=false&theme={theme}'
)

# Captures the value of the single-quoted data-search-themes attribute.
THEME_REGEX = "data-search-themes='(.*?)'"

# Relative destination of each downloaded PDF.
PATH_TEMPLATE = (
    'download/{theme}/{productId} - {productName}/'
    '{description} - {pdf}'
)
def download_file(url, save_to):
    """Download *url* to *save_to* with wget, creating parent directories.

    Args:
        url: Address of the file to fetch.
        save_to: Destination file path; its directory is created if missing.

    Returns:
        The exit status of wget (0 on success), or 1 when the destination
        directory could not be created and wget was therefore not run.
    """
    directory = os.path.dirname(save_to)
    # A bare file name has no directory component, so there is nothing to
    # create in that case.
    if directory and not os.path.isdir(directory):
        try:
            os.makedirs(directory)
        except OSError as error:
            # Tolerate the directory appearing between the check and the
            # call; anything else means the download cannot succeed.
            if not os.path.isdir(directory):
                logger.error(
                    'Cannot create directory %s: %s', directory, error)
                return 1

    # 20 s timeout, 3 tries (also on refused connections), resume partial
    # downloads, write to the requested path.
    wget = ["wget", "-T20", "--tries=3", "--retry-connrefused", "--continue",
            "-O", save_to, url]
    return call(wget)
def get(url, max_attempts=3):
    """Fetch *url* and return the raw response body.

    The request is sent with the 'Lego Scrape 1.0' User-Agent. Failed
    attempts are logged and retried until *max_attempts* is reached, except
    for HTTP 404 which is raised immediately without retrying.

    Args:
        url: Address to fetch.
        max_attempts: Total number of attempts before giving up.

    Returns:
        The response body as returned by ``response.read()``.

    Raises:
        ValueError: If *max_attempts* is smaller than 1.
        urllib2.HTTPError: On a 404, or on the last failed attempt.
        urllib2.URLError: On the last failed attempt.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    logger.debug('Downloading {0}'.format(urllib.unquote(url)))

    # The opener is identical for every attempt, so build it only once.
    opener = urllib2.build_opener()
    opener.addheaders = [('User-Agent', 'Lego Scrape 1.0')]

    for attempt in range(1, max_attempts + 1):
        try:
            response = opener.open(url)
        except urllib2.HTTPError as error:
            # A missing page will not appear on retry; fail fast.
            if error.code == 404:
                raise
            logger.error(error)
            if attempt == max_attempts:
                # Bare raise keeps the original traceback.
                raise
        except urllib2.URLError as error:
            logger.error(error)
            if attempt == max_attempts:
                raise
        else:
            # Always release the connection, even if reading fails.
            try:
                return response.read()
            finally:
                response.close()
def get_themes():
    """Return the theme list embedded in the building-instructions page.

    The page at MAIN_URL is downloaded and the first value matched by
    THEME_REGEX (the data-search-themes attribute) is decoded as JSON.

    Returns:
        The decoded JSON value of the attribute.

    Raises:
        IndexError: If the page contains no match for THEME_REGEX. The
            exception type matches what indexing an empty match list
            raised before, but now carries a descriptive message.
        ValueError: If the attribute value is not valid JSON.
    """
    page = get(MAIN_URL)
    # Only the first occurrence is needed, so stop scanning once found.
    match = re.search(THEME_REGEX, page)
    if match is None:
        raise IndexError(
            'no data-search-themes attribute found in {0}'.format(MAIN_URL))
    return json.loads(match.group(1))
def get_theme_products(theme):
    """Return all products of *theme*, using a JSON cache on disk.

    If ``data/<theme>.json`` exists its content is returned as-is. Otherwise
    the search endpoint is paged through in steps of 10 until it reports no
    more data, and the collected products are cached before being returned.

    Args:
        theme: Theme key substituted into THEME_URL and the cache file name.

    Returns:
        The list of product entries for the theme.
    """
    theme_filename = 'data/{}.json'.format(theme)
    if os.path.exists(theme_filename):
        with open(theme_filename) as fin:
            return json.load(fin)

    # Make sure the cache directory exists before downloading anything, so
    # a full crawl is not thrown away by a failing write at the end.
    cache_dir = os.path.dirname(theme_filename)
    if cache_dir and not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)

    products = []
    idx = 0
    more_data = True
    while more_data:
        page = json.loads(get(THEME_URL.format(idx=idx, theme=theme)))
        page_products = page['products']
        products.extend(page_products)
        more_data = page['moreData']
        if more_data and not page_products:
            # An empty page that still claims more data would otherwise
            # keep this loop running forever.
            logger.warning(
                'Empty page at index %s for theme %s; stopping', idx, theme)
            break
        idx += 10

    # Write to a temporary file and rename it into place so an interrupted
    # run never leaves a truncated cache file behind.
    tmp_filename = theme_filename + '.tmp'
    with open(tmp_filename, 'w') as fout:
        json.dump(products, fout)
    os.rename(tmp_filename, theme_filename)
    return products
def get_all_products():
    """Return the products of every theme as one flat list.

    Themes come from get_themes(); each theme's 'Key' entry is passed to
    get_theme_products() and the results are concatenated in theme order.
    """
    return [
        product
        for entry in get_themes()
        for product in get_theme_products(entry['Key'])
    ]
def safe_name(file_name):
    """Return *file_name* reduced to a conservative set of characters.

    '+' becomes 'plus' and ':' becomes '-'. After that every character
    other than letters, digits, '-', '_', '.', '(', ')' and space is
    replaced by '_'.
    """
    substituted = file_name.replace('+', 'plus').replace(':', '-')
    return re.sub('[^-a-zA-Z0-9_.() ]', '_', substituted)
def get_all_urls_and_paths():
    """Yield a (url, path) pair for every building instruction.

    For each product from get_all_products() and each entry in its
    'buildingInstructions' list, the path is PATH_TEMPLATE filled with
    sanitized theme, product and download fields, and the url is the
    entry's 'pdfLocation'.
    """
    for item in get_all_products():
        for instruction in item['buildingInstructions']:
            # Fields are read in the same order as the template arguments.
            fields = {
                'theme': safe_name(item['themeName']),
                'productId': safe_name(item['productId']),
                'productName': safe_name(item['productName']),
                'description': safe_name(instruction['description']),
                # Last URL segment serves as the PDF file name.
                'pdf': safe_name(instruction['pdfLocation'].split('/')[-1]),
            }
            destination = PATH_TEMPLATE.format(**fields)
            yield instruction['pdfLocation'], destination
def main():
    """Download every building-instruction PDF to its target path.

    Failures do not stop the run; a non-zero status returned by
    download_file() is logged and the next file is attempted.
    """
    for url, path in get_all_urls_and_paths():
        status = download_file(url, path)
        if status != 0:
            # Previously the status was dropped, hiding failed downloads.
            logger.error(
                'Download failed with status %s: %s -> %s', status, url, path)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement