Advertisement
Guest User

buzz_scraper.py

a guest
Jan 8th, 2020
1,304
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.63 KB | None | 0 0
  1. # pip install httplib2
  2. # pip install wget
  3. # pip install bs4
  4.  
  5. import os
  6. import httplib2
  7. import wget
  8. from bs4 import BeautifulSoup, SoupStrainer
  9.  
  10. def get_zip_url_list(url):
  11.     zip_url_list = []
  12.     http = httplib2.Http()
  13.     status, response = http.request(url)
  14.     for link in BeautifulSoup(response, parse_only=SoupStrainer('a'), features="html.parser"):
  15.         if link.has_attr('href'):
  16.             if ".zip" in link['href']:
  17.                 zip_url_list.append(link['href'])
  18.     return zip_url_list
  19.  
  20. def download_from_url(zip_url_list):
  21.     downloaded_list = []
  22.     skipped_list = []
  23.     failed_list = []
  24.     for href in zip_url_list:
  25.         try:
  26.             filename = href.split('/')[-1]
  27.             if filename in os.listdir():
  28.                 print(f'{filename} already exists, skipping...')
  29.                 if not filename in skipped_list:
  30.                     skipped_list.append(filename)
  31.             else:
  32.                 print(f'\nDownloading: {str(href)}')
  33.                 wget.download(str(href))
  34.                 downloaded_list.append(href)
  35.         except Exception as e:
  36.             failed_list.append(href)
  37.             if '404' in e:
  38.                 print(f'Unable to download file, {href}, HTTP Error 404...')
  39.             else:
  40.                 print('Error' + str(e))
  41.     return (downloaded_list, failed_list, skipped_list)
  42.  
  43. def print_results(results):
  44.     downloaded_list, failed_list, skipped_list = results
  45.     print('\n--------------------DOWNLOADED--------------------')
  46.     for url in downloaded_list:
  47.         print(url)
  48.     print('--------------------SKIPPED---------------------')
  49.     for url in skipped_list:
  50.         print(url)
  51.     print('\n--------------------FAILED---------------------')
  52.     for url in failed_list:
  53.         print(url)
  54.     print('\n--------------------SUMMARY---------------------')
  55.     print(f'D: {len(downloaded_list)} | S: {len(skipped_list)} | F: {len(failed_list)}')
  56.  
  57.     try:
  58.         with open('summary.txt', 'w') as filehandle:
  59.             filehandle.writelines(f'D: {len(downloaded_list)} | S: {len(skipped_list)} | F: {len(failed_list)}\n')
  60.             filehandle.writelines('FAILED: %s\n' % failed for failed in failed_list)
  61.             filehandle.writelines('SKIPPED: %s\n' % skipped for skipped in skipped_list)
  62.             filehandle.writelines('DOWNLOADED: %s\n' % downloaded for downloaded in downloaded_list)
  63.     except Exception as e:
  64.         print('Unable to write summary to file...')
  65.         print(e)
  66.        
  67.  
  68.  
  69. url = "https://www.3dbuzz.com/"
  70. zip_url_list = get_zip_url_list(url)
  71. results = download_from_url(zip_url_list)
  72. print_results(results)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement