ASF Download Script

Feb 19th, 2017

#!/usr/bin/python

#
# Usage:
#
#    In a terminal/command line, cd to the directory where this file lives. Then...
#
#    With embedded urls: (download the hardcoded list of files in the 'files = []' block below)
#
#       python ./download-all-2017-2-19_14-49-4.py
#
#    Download all files in a Metalink/CSV: (downloaded from ASF Vertex)
#
#       python ./download-all-2017-2-19_14-49-4.py /path/to/downloads.metalink localmetalink.metalink localcsv.csv
#
#    Compatibility: python >= 2.6.5, 2.7.5, 3.0
#
#    For more information, navigate to https://www.asf.alaska.edu/data-tools/bulk-download/
#

import sys, csv
import os, os.path
import tempfile, shutil

import base64
import time
import getpass

import xml.etree.ElementTree as ET

#############
# This next block handles Python 2/3 compatibility imports

try:
   # Python 2.x Libs
   from urllib2 import build_opener, install_opener, Request, urlopen, HTTPError
   from urllib2 import URLError, HTTPHandler, HTTPRedirectHandler, HTTPCookieProcessor
   from urllib import addinfourl

   from cookielib import CookieJar
   from StringIO import StringIO

except ImportError:
   # Python 3.x Libs
   from urllib.request import build_opener, install_opener, Request, urlopen
   from urllib.request import HTTPHandler, HTTPRedirectHandler, HTTPCookieProcessor
   from urllib.response import addinfourl
   from urllib.error import HTTPError, URLError

   from http.cookiejar import CookieJar
   from io import StringIO

# List of files to download
files = [
"https://datapool.asf.alaska.edu/RAW/SB/S1B_EW_RAW__0SDV_20170217T234042_20170217T234152_004347_0078D9_B1A3.zip",
"https://datapool.asf.alaska.edu/RAW/SA/S1A_IW_RAW__0SDV_20161127T223438_20161127T223510_014134_016D1C_1A83.zip",
"https://datapool.asf.alaska.edu/RAW/SB/S1B_EW_RAW__0SDV_20161125T234046_20161125T234156_003122_0054ED_51FB.zip"]

# Local stash of the datapool cookie so we don't always have to ask
cookie_file_path = os.path.join( os.path.expanduser('~'), ".asf_datapool_cookie.txt")

# Some internal URS4 Auth stuff
asf_urs4 = { 'url': 'https://urs.earthdata.nasa.gov/oauth/authorize',
             'client': 'BO_n7nTIlMljdvU6kRRB3g',
             'redir': 'https://vertex.daac.asf.alaska.edu/services/urs4_token_request'}

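# A rough sketch of the auth flow implemented in get_new_cookie() below
# (read from this script's code, not from official URS documentation):
#
#    GET https://urs.earthdata.nasa.gov/oauth/authorize
#        ?client_id=<client>&redirect_uri=<redir>&response_type=code&state=
#    Authorization: Basic <base64 of username:password>
#
# URS authenticates the user, bounces through the OAuth redirect chain to
# the ASF 'redir' endpoint above, and a 'datapool' cookie lands in the
# cookie jar. That cookie value is all the downloader needs afterwards.
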
# Get and validate a cookie
def get_cookie():
   cookie = None

   # check for an existing datapool cookie
   if os.path.isfile( cookie_file_path ):
      with open(cookie_file_path, 'r') as cookie_file:
         cookie = cookie_file.read()

      # make sure the cookie is still valid
      if check_cookie(cookie):
         print(" > Reusing previous valid datapool cookie.")
         return cookie
      else:
         cookie = None

   # We don't have a valid cookie; prompt the user for credentials
   if cookie is None:
      print ("No existing datapool access cookie found, please enter Earthdata username & password:")
      print ("(Credentials will not be stored, saved or logged anywhere)")

   # Keep trying until the user gets the username/password right
   while cookie is None:
      cookie = get_new_cookie()

   return cookie

# Stash the cookie so we don't always ask for auth
def write_cookie_to_file(cookie):

   if os.path.isfile( cookie_file_path ):
      if os.access(cookie_file_path, os.W_OK) is False:
         print ("Cannot write cookie file!")
         return False

   # a context manager guarantees the cookie file is flushed and closed
   with open(cookie_file_path, 'w') as cookie_file:
      cookie_file.write(cookie)
   return True

# Validate cookie before we begin
def check_cookie(cookie):
   if cookie is None:
      return False

   # File we know is valid, used to validate the cookie
   file_check = 'https://datapool.asf.alaska.edu/GEOTIFF/SS/SS_01499_STD_F1309_tif.zip'

   # Catch redirects, since a redirect would mean a problem with the cookie
   class NoRedirectHandler(HTTPRedirectHandler):
      def http_error_302(self, req, fp, code, msg, headers):
         infourl = addinfourl(fp, headers, req.get_full_url())
         infourl.status = code
         infourl.code = code
         return infourl
      http_error_300 = http_error_302
      http_error_301 = http_error_302
      http_error_303 = http_error_302

   # Apply the custom redirect handler
   opener = build_opener(NoRedirectHandler())
   install_opener(opener)

   # Attempt a HEAD request
   request = Request(file_check)
   request.add_header('Cookie', 'datapool='+cookie)
   request.get_method = lambda : 'HEAD'
   try:
      response = urlopen(request)
      resp_code = response.getcode()

   except HTTPError as e:
      # If we get this error, it likely means the user has not agreed to the current EULA
      print ("\nIMPORTANT: ")
      print ("Your user appears to lack permissions to download data from the ASF Datapool. This is probably because you have not agreed to the current EULA. Please log into URS from Vertex (https://vertex.daac.asf.alaska.edu) and confirm the EULA. If this error persists, please contact ASF User Services\n\n")
      exit(-1)

   # These return codes indicate the user has not been approved to download the data
   if resp_code in (300, 301, 302, 303):
      try:
         redir_url = response.info().getheader('Location')
      except AttributeError:
         redir_url = response.getheader('Location')

      # Funky test env:
      if ("vertex.daac.asf.alaska.edu" in redir_url and "test" in asf_urs4['redir']):
         print ("Cough, cough. It's dusty in this test env!")
         return True

      print ("Redirect ({0}) occurred, invalid datapool cookie value!".format(resp_code))
      return False

   # These are successes!
   if resp_code in (200, 307):
      return True

   return False

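# The validation above, sketched as a raw HTTP exchange (illustrative
# values, not captured traffic):
#
#    HEAD /GEOTIFF/SS/SS_01499_STD_F1309_tif.zip HTTP/1.1
#    Host: datapool.asf.alaska.edu
#    Cookie: datapool=<cookie value>
#
# A 200 or 307 response means the cookie is still good; a 30x redirect,
# which NoRedirectHandler surfaces instead of following, means the
# datapool bounced us back to URS and the cookie is stale or invalid.
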
def get_new_cookie():

   # Start by prompting the user to input their credentials

   # Another Python 2/3 workaround
   try:
      new_username = raw_input("Username: ")
   except NameError:
      new_username = input("Username: ")
   new_password = getpass.getpass(prompt="Password (Will not be Echoed!): ")

   # Build the URS4 cookie request
   auth_cookie_url = asf_urs4['url'] + '?client_id=' + asf_urs4['client'] + '&redirect_uri=' + asf_urs4['redir'] + '&response_type=code&state='

   try:
      # Python 2
      user_pass = base64.b64encode(bytes(new_username+":"+new_password))
   except TypeError:
      # Python 3
      user_pass = base64.b64encode(bytes(new_username+":"+new_password, "utf-8"))
      user_pass = user_pass.decode("utf-8")

   # Authenticate against URS, grab all the cookies
   cj = CookieJar()
   opener = build_opener(HTTPCookieProcessor(cj), HTTPHandler())
   request = Request(auth_cookie_url, headers={"Authorization": "Basic {0}".format(user_pass)})

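   # As a worked example with hypothetical credentials (not from this
   # script): "user:pass" base64-encodes to "dXNlcjpwYXNz", so the header
   # built above would read: Authorization: Basic dXNlcjpwYXNz
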
   # Watch out for cookie rejection!
   try:
      response = opener.open(request)
   except HTTPError as e:
      if e.code == 401:
         print (" > Username and Password combo was not successful. Please try again.")
         return None
      else:
         # If an error happens here, the user most likely has not confirmed the EULA.
         print ("\nIMPORTANT: There was an error obtaining a download cookie!")
         print ("Your user appears to lack permissions to download data from the ASF Datapool. This is probably because you have not agreed to the current EULA. Please log into URS from Vertex (https://vertex.daac.asf.alaska.edu) and confirm the EULA. If this error persists, please contact ASF User Services\n\n")
         exit(-1)
   except URLError as e:
      print ("\nIMPORTANT: There was a problem communicating with URS, unable to obtain a cookie. ")
      print ("Try cookie generation later. ")
      exit(-1)

   # Did we get a cookie?
   for cookie in cj:
      if cookie.name == 'datapool':
         # COOKIE SUCCESS!
         write_cookie_to_file(cookie.value)
         return cookie.value

   # If we aren't successful generating the cookie, nothing will work. Stop here!
   print ("WARNING: Could not generate a new cookie! Cannot proceed. Please try Username and Password again")
   print ("Response was {0}.".format(response.getcode()))
   exit(-1)

# Download the file
def download_file_with_cookie(file, cookie, cnt, total):

   # see if we've already downloaded this file
   download_file = os.path.basename(file)
   if os.path.isfile(download_file):
      print (" > Download file {0} exists! \n > Skipping download of {1}. ".format(download_file, file))
      print (" > If you want to re-download it, move or remove that file ")
      return None

   # attempt https connection
   try:
      request = Request(file)
      request.add_header('Cookie', 'datapool='+cookie)
      response = urlopen(request)

      # Watch for a redirect
      resp_code = response.getcode()
      if response.geturl() != file:
         print (" > Temporary Redirect download @ ASF Remote archive:\n > {0}".format(response.geturl()))

      # seems to be working
      print ("({0}/{1}) Downloading {2}".format(cnt, total, file))

      # Open our local file for writing and build the status bar
      tf = tempfile.NamedTemporaryFile(mode='w+b', delete=False)
      chunk_read(response, tf, report_hook=chunk_report)

      tempfile_name = tf.name
      tf.close()

   # handle errors
   except HTTPError as e:
      print ("HTTP Error:", e.code, file)
      if e.code == 401:
         print (" > IMPORTANT: Your user does not have permissions to download this type of data!")
      return False

   except URLError as e:
      print ("URL Error:", e.reason, file)
      return False

   # Move the download into place and return the file size
   shutil.copy(tempfile_name, download_file)
   os.remove(tempfile_name)
   return os.path.getsize(download_file)

#  chunk_report taken from http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
def chunk_report(bytes_so_far, chunk_size, total_size):
   percent = float(bytes_so_far) / total_size
   percent = round(percent*100, 2)
   sys.stdout.write(" > Downloaded %d of %d bytes (%0.2f%%)\r" %
       (bytes_so_far, total_size, percent))

   if bytes_so_far >= total_size:
      sys.stdout.write('\n')

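# Worked example: 4194304 of 8388608 bytes gives 4194304/8388608 = 0.5,
# so the line printed is " > Downloaded 4194304 of 8388608 bytes (50.00%)".
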
#  chunk_read modified from http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
def chunk_read(response, local_file, chunk_size=8192, report_hook=None):
   try:
      total_size = response.info().getheader('Content-Length').strip()
   except AttributeError:
      total_size = response.getheader('Content-Length').strip()
   total_size = int(total_size)
   bytes_so_far = 0

   while True:
      chunk = response.read(chunk_size)
      try:
         local_file.write(chunk)
      except TypeError:
         # text-mode file handle: decode the chunk before writing
         local_file.write(chunk.decode(local_file.encoding))
      bytes_so_far += len(chunk)

      if not chunk:
         break

      if report_hook:
         report_hook(bytes_so_far, chunk_size, total_size)

   return bytes_so_far

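# A minimal standalone use of chunk_read, kept commented out (illustrative
# sketch only; the URL is a placeholder, not part of this script's flow):
#
#    response = urlopen(Request('https://example.com/file.zip'))
#    with open('file.zip', 'wb') as f:
#       chunk_read(response, f, report_hook=chunk_report)
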
# Get download urls from a metalink file
def process_metalink(ml_file):

   print ("Processing metalink file: {0}".format(ml_file))
   with open(ml_file, 'r') as ml:
      xml = ml.read()

   # Hack to remove the annoying namespace
   it = ET.iterparse(StringIO(xml))
   for _, el in it:
      if '}' in el.tag:
         el.tag = el.tag.split('}', 1)[1]  # strip all namespaces
   root = it.root

   dl_urls = []
   files = root.find('files')
   for dl in files:
      dl_urls.append(dl.find('resources').find('url').text)

   if len(dl_urls) > 0:
      return dl_urls
   else:
      return None

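# process_metalink() expects a metalink shaped roughly like this skeleton,
# inferred from the find() calls above rather than from a real Vertex file:
#
#    <metalink xmlns="...">
#      <files>
#        <file name="...">
#          <resources>
#            <url>https://datapool.asf.alaska.edu/.../granule.zip</url>
#          </resources>
#        </file>
#      </files>
#    </metalink>
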
# Get download urls from a csv file
def process_csv(csv_file):

   print ("Processing csv file: {0}".format(csv_file))

   dl_urls = []
   with open(csv_file, 'r') as csvf:
      try:
         csvr = csv.DictReader(csvf)
         for row in csvr:
            dl_urls.append(row['URL'])
      except csv.Error as e:
         print ("WARNING: Could not parse file %s, line %d: %s. Skipping." % (csv_file, csvr.line_num, e))
         return None
      except KeyError as e:
         print ("WARNING: Could not find a 'URL' column in file %s. Skipping." % (csv_file))

   if len(dl_urls) > 0:
      return dl_urls
   else:
      return None

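# process_csv() only requires a header row containing a 'URL' column; a
# hypothetical input (extra columns are simply ignored) could look like:
#
#    Granule Name,URL
#    S1B_EW_RAW__0SDV_...,https://datapool.asf.alaska.edu/RAW/SB/granule.zip
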
if __name__ == "__main__":

   # Make sure we can write to our current directory
   if os.access(os.getcwd(), os.W_OK) is False:
      print ("WARNING: Cannot write to current path! Check permissions for {0}".format(os.getcwd()))
      exit(-1)

   # grab a cookie
   cookie = get_cookie()

   # Check if the user handed in a Metalink or CSV:
   if len(sys.argv) > 1:
      download_files = []
      input_files = []
      for arg in sys.argv[1:]:
         if arg.endswith('.metalink') or arg.endswith('.csv'):
            if os.path.isfile( arg ):
               input_files.append( arg )
               if arg.endswith('.metalink'):
                  new_files = process_metalink(arg)
               else:
                  new_files = process_csv(arg)
               if new_files is not None:
                  for file_url in new_files:
                     download_files.append( file_url )
            else:
               print (" > I cannot find the input file you specified: {0}".format(arg))
         else:
            print (" > Command line argument '{0}' makes no sense, ignoring".format(arg))

      if len(input_files) > 0:
         if len(download_files) > 0:
            print (" > Processing {0} downloads from {1} input files. ".format(len(download_files), len(input_files)))
            files = download_files
         else:
            print (" > I see you asked me to download files from {0} input files, but they had no downloads! ".format(len(input_files)))
            print (" > I'm super confused and exiting.")
            exit(-1)

   # summary
   total_bytes = 0
   total_time = 0
   cnt = 0
   success = []
   failed = []
   skipped = []

   for file in files:

      # download counter
      cnt += 1

      # set a timer
      start = time.time()

      # run the download
      size = download_file_with_cookie(file, cookie, cnt, len(files))

      # calculate the rate
      end = time.time()

      # stats:
      if size is None:
         skipped.append(file)

      elif size is not False:
         # Download was good!
         elapsed = end - start
         elapsed = 1.0 if elapsed < 1 else elapsed
         rate = (size/1024.0**2)/elapsed

         print ("Downloaded {0} bytes in {1:.2f} secs, Average Rate: {2:.2f}MB/sec".format(size, elapsed, rate))

         # add up metrics
         total_bytes += size
         total_time += elapsed
         success.append( {'file':file, 'size':size } )

      else:
         print ("There was a problem downloading {0}".format(file))
         failed.append(file)

   # Print summary:
   print ("\n\nDownload Summary ")
   print ("--------------------------------------------------------------------------------")
   print ("  Successes: {0} files, {1} bytes ".format(len(success), total_bytes))
   for success_file in success:
      print ("           - {0}  {1:.2f}MB".format(success_file['file'],(success_file['size']/1024.0**2)))
   if len(failed) > 0:
      print ("  Failures: {0} files".format(len(failed)))
      for failed_file in failed:
         print ("          - {0}".format(failed_file))
   if len(skipped) > 0:
      print ("  Skipped: {0} files".format(len(skipped)))
      for skipped_file in skipped:
         print ("          - {0}".format(skipped_file))
   if len(success) > 0:
      print ("  Average Rate: {0:.2f}MB/sec".format( (total_bytes/1024.0**2)/total_time))
   print ("--------------------------------------------------------------------------------")