Advertisement
Guest User

Data Scraper

a guest
Dec 10th, 2019
257
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.27 KB | None | 0 0
import os
import time
import urllib
import urllib.error
import urllib.request

import requests
from bs4 import BeautifulSoup
  6.  
# Archive page listing every available Himawari-8 full-disk true-color image.
ARCHIVE_URL = 'http://rammb.cira.colostate.edu/ramsdis/online/archive_hi_res.asp?data_folder=himawari-8/full_disk_ahi_true_color&width=800&height=800'
# Image hrefs on the archive page are relative to this base URL.
URL_PREFIX = 'http://rammb.cira.colostate.edu/ramsdis/online/'
# Local directory downloads are written into (Windows-style separator).
IMG_PREFIX = 'images\\'
# Anchor text that identifies the hi-res image links to follow.
LINK_STR = 'Hi-Res Image'
  11.  
  12. def main():
  13.     archive_html = get_html_data(ARCHIVE_URL)
  14.     bs_page = BeautifulSoup(archive_html, 'html.parser')
  15.     links = bs_page.findAll('a', string=LINK_STR)
  16.     print('Got Links')
  17.     time.sleep(5.0)
  18.     for i in range(300, len(links)):
  19.         link = links[i]['href']
  20.         img_path = IMG_PREFIX + link.split('/')[-1]
  21.  
  22.         if not os.path.exists(img_path):
  23.             while True:
  24.                 try:
  25.                     urllib.request.urlretrieve(URL_PREFIX + link, img_path)
  26.                 except:
  27.                     print(f'Failed... Retrying')
  28.                     time.sleep(0.1)
  29.                 else:
  30.                     break
  31.            
  32.             print(f'Downloaded image {img_path} | {i}/{len(links)}')
  33.             time.sleep(5.0)
  34.  
  35.  
  36. def get_html_data(url):
  37.     page = requests.get(url, verify=False)
  38.     page.close()
  39.     return page.content
  40.  
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement