Advertisement
Guest User

Untitled

a guest
Jul 29th, 2023
48
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.36 KB | History | 0 0
  1. #!/usr/bin/python
  2.  
  3. import json
  4. import logging
  5. import os
  6. import re
  7. import urllib
  8. import urllib2
  9.  
  10. from subprocess import call
  11.  
  # Emit every record from DEBUG upwards, formatted as "LEVEL:message".
  logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
  logger = logging.getLogger(__name__)

  # Landing page; get_themes() extracts the theme list from its HTML.
  MAIN_URL = 'https://www.lego.com/en-us/service/buildinginstructions/'
  # Paged product search for one theme; {idx} is the start index, {theme} the theme key.
  THEME_URL = 'https://www.lego.com//service/biservice/searchbytheme?fromIndex={idx}&onlyAlternatives=false&theme={theme}'
  # Captures the single-quoted value of the data-search-themes attribute.
  THEME_REGEX = "data-search-themes='(.*?)'"
  # Relative destination path of each downloaded PDF.
  PATH_TEMPLATE = 'download/{theme}/{productId} - {productName}/{description} - {pdf}'
  19.  
  def download_file(url, save_to):
      """Download *url* to the local path *save_to* with wget.

      The parent directory of *save_to* is created first when it is missing.
      Returns the exit status of the wget process (0 on success).
      """
      directory = os.path.dirname(save_to)
      # An empty dirname means "current directory": nothing to create.
      if directory and not os.path.isdir(directory):
          try:
              os.makedirs(directory)
          except OSError as exc:
              # Losing a creation race is fine; anything else is reported and
              # wget is still attempted so the caller gets its exit status.
              if not os.path.isdir(directory):
                  logger.error('Cannot create directory %s: %s', directory, exc)

      wget = ["wget", "-T20", "--tries=3", "--retry-connrefused", "--continue", "-O", save_to, url]
      return call(wget)
  27.  
  def get(url):
      """Fetch *url* and return the response body.

      The request is attempted up to three times. An HTTP 404 is re-raised
      immediately without logging; any other urllib2.URLError (HTTPError
      included) is logged and retried, and the third failure is re-raised.
      Errors raised while reading the body are not retried.
      """
      logger.debug('Downloading {0}'.format(urllib.unquote(url)))

      # The opener does not change between attempts, so build it once.
      opener = urllib2.build_opener()
      opener.addheaders = [('User-Agent', 'Lego Scrape 1.0')]

      max_attempts = 3
      for attempt in range(1, max_attempts + 1):
          try:
              response = opener.open(url)
          except urllib2.URLError as exc:
              # HTTPError subclasses URLError, so both kinds land here.
              if isinstance(exc, urllib2.HTTPError) and exc.code == 404:
                  # A missing page will not appear on retry.
                  raise
              logger.error(exc)
              if attempt == max_attempts:
                  # Bare raise keeps the original traceback.
                  raise
              continue

          try:
              return response.read()
          finally:
              # Release the connection even if reading fails.
              response.close()
  55.  
  def get_themes():
      """Return the theme list embedded in the building-instructions page.

      The page at MAIN_URL is downloaded, the first match of THEME_REGEX is
      taken and its captured text is decoded as JSON.

      Raises IndexError when the page contains no match, and whatever
      json.loads raises when the captured text is not valid JSON.
      """
      page = get(MAIN_URL)
      match = re.search(THEME_REGEX, page)
      if match is None:
          # Same exception type as indexing an empty findall() result, but
          # with a message that says what actually went wrong.
          raise IndexError('no theme data found in {0}'.format(MAIN_URL))
      return json.loads(match.group(1))
  60.  
  def get_theme_products(theme):
      """Return the list of products for *theme*, using a JSON file cache.

      If data/<theme>.json exists its decoded content is returned and nothing
      is downloaded. Otherwise THEME_URL is fetched page by page, advancing
      the start index by 10, until a response reports a false 'moreData';
      the collected 'products' entries are cached to that file and returned.
      """
      theme_filename = 'data/{}.json'.format(theme)
      if os.path.exists(theme_filename):
          with open(theme_filename) as fin:
              return json.load(fin)

      products = []
      idx = 0
      more_data = True
      while more_data:
          page = json.loads(get(THEME_URL.format(idx=idx, theme=theme)))
          products.extend(page['products'])
          more_data = page['moreData']
          idx += 10

      # Make sure the cache directory exists so the downloaded data is not
      # lost to an IOError at the very end.
      cache_dir = os.path.dirname(theme_filename)
      if not os.path.isdir(cache_dir):
          try:
              os.makedirs(cache_dir)
          except OSError:
              if not os.path.isdir(cache_dir):
                  raise

      # Write to a temporary name and rename it, so an interrupted run never
      # leaves a truncated cache file for later runs to choke on.
      tmp_filename = theme_filename + '.tmp'
      with open(tmp_filename, 'w') as fout:
          json.dump(products, fout)
      os.rename(tmp_filename, theme_filename)

      return products
  80.  
  def get_all_products():
      """Return the products of every theme as one flat list.

      Themes come from get_themes(); each theme's 'Key' entry is passed to
      get_theme_products() and the results are concatenated in theme order.
      """
      return [
          product
          for theme in get_themes()
          for product in get_theme_products(theme['Key'])
      ]
  87.  
  def safe_name(file_name):
      """Return *file_name* reduced to characters safe for a path component.

      '+' becomes 'plus', ':' becomes '-', and every character other than
      ASCII letters, digits, '-', '_', '.', '(', ')' and space becomes '_'.
      A result made up only of dots (such as '.' or '..') has its dots
      replaced by '_' so it cannot refer to the current or parent directory.
      """
      file_name = file_name.replace('+', 'plus').replace(':', '-')
      cleaned = re.sub(r'[^-a-zA-Z0-9_.() ]', '_', file_name)
      # The names come from downloaded JSON; do not let one walk up the tree.
      if cleaned and not cleaned.strip('.'):
          return cleaned.replace('.', '_')
      return cleaned
  92.  
  def get_all_urls_and_paths():
      """Yield a (url, path) pair for every building-instruction PDF.

      For each product from get_all_products() and each entry in its
      'buildingInstructions' list, *url* is the entry's 'pdfLocation' and
      *path* is PATH_TEMPLATE filled with safe_name()-sanitised values; the
      file name is the text after the last '/' of the URL.
      """
      for product in get_all_products():
          # These three do not depend on the individual PDF, so compute them
          # once per product.
          theme = safe_name(product['themeName'])
          product_id = safe_name(product['productId'])
          product_name = safe_name(product['productName'])

          for download in product['buildingInstructions']:
              url = download['pdfLocation']
              path = PATH_TEMPLATE.format(
                  theme=theme,
                  productId=product_id,
                  productName=product_name,
                  description=safe_name(download['description']),
                  pdf=safe_name(url.split('/')[-1]),
              )
              yield url, path
  103.  
  def main():
      """Download every PDF yielded by get_all_urls_and_paths().

      A non-zero status returned by download_file() is logged and the run
      continues with the next file.
      """
      for url, path in get_all_urls_and_paths():
          status = download_file(url, path)
          if status != 0:
              logger.warning('Download of %s to %s failed with status %s', url, path, status)
  107.  
  # Run the scraper only when executed as a script, not when imported.
  if __name__ == '__main__':
      main()
Tags: lego
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement