ASF Download Script

Feb 19th, 2017

#!/usr/bin/python

#
# Usage:
#
#    In a terminal/command line, cd to the directory where this file lives. Then...
#
#    With embedded urls: (download the hardcoded list of files in the 'files = []' block below)
#
#       python ./download-all-2017-2-19_14-49-4.py
#
#    Download all files in a Metalink/CSV: (downloaded from ASF Vertex)
#
#       python ./download-all-2017-2-19_14-49-4.py /path/to/downloads.metalink localmetalink.metalink localcsv.csv
#
#    Compatibility: python >= 2.6.5, 2.7.5, 3.0
#
#    For more information, navigate to https://www.asf.alaska.edu/data-tools/bulk-download/
#

import sys, csv
import os, os.path
import tempfile, shutil

import base64
import time
import getpass

import xml.etree.ElementTree as ET

#############
# This next block handles Python 2/3 compatibility imports

try:
   # Python 2.x Libs
   from urllib2 import build_opener, install_opener, Request, urlopen, HTTPError
   from urllib2 import URLError, HTTPHandler, HTTPRedirectHandler, HTTPCookieProcessor
   from urllib import addinfourl

   from cookielib import CookieJar
   from StringIO import StringIO

except ImportError:
   # Python 3.x Libs
   from urllib.request import build_opener, install_opener, Request, urlopen
   from urllib.request import HTTPHandler, HTTPRedirectHandler, HTTPCookieProcessor
   from urllib.response import addinfourl
   from urllib.error import HTTPError, URLError

   from http.cookiejar import CookieJar
   from io import StringIO

# List of files to download
files = [
"https://datapool.asf.alaska.edu/RAW/SB/S1B_EW_RAW__0SDV_20170217T234042_20170217T234152_004347_0078D9_B1A3.zip",
"https://datapool.asf.alaska.edu/RAW/SA/S1A_IW_RAW__0SDV_20161127T223438_20161127T223510_014134_016D1C_1A83.zip",
"https://datapool.asf.alaska.edu/RAW/SB/S1B_EW_RAW__0SDV_20161125T234046_20161125T234156_003122_0054ED_51FB.zip"]

# Local stash of the datapool cookie so we don't always have to ask
cookie_file_path = os.path.join( os.path.expanduser('~'), ".asf_datapool_cookie.txt")

# Some internal URS4 Auth stuff
asf_urs4 = { 'url': 'https://urs.earthdata.nasa.gov/oauth/authorize',
             'client': 'BO_n7nTIlMljdvU6kRRB3g',
             'redir': 'https://vertex.daac.asf.alaska.edu/services/urs4_token_request'}

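# A rough sketch of the auth flow implemented in get_new_cookie() below
# (read from this script's code, not from official URS documentation):
#
#    GET https://urs.earthdata.nasa.gov/oauth/authorize
#        ?client_id=<client>&redirect_uri=<redir>&response_type=code&state=
#    Authorization: Basic <base64 of username:password>
#
# URS authenticates the user, bounces through the OAuth redirect chain to
# the ASF 'redir' endpoint above, and a 'datapool' cookie lands in the
# cookie jar. That cookie value is all the downloader needs afterwards.
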
# Get and validate a cookie
def get_cookie():
   cookie = None

   # check for an existing datapool cookie
   if os.path.isfile( cookie_file_path ):
      with open(cookie_file_path, 'r') as cookie_file:
         cookie = cookie_file.read()

      # make sure the cookie is still valid
      if check_cookie(cookie):
         print(" > Reusing previous valid datapool cookie.")
         return cookie
      else:
         cookie = None

   # We don't have a valid cookie; prompt the user for credentials
   if cookie is None:
      print ("No existing datapool access cookie found, please enter Earthdata username & password:")
      print ("(Credentials will not be stored, saved or logged anywhere)")

   # Keep trying until the user gets the username/password right
   while cookie is None:
      cookie = get_new_cookie()

   return cookie

# Stash the cookie so we don't always ask for auth
def write_cookie_to_file(cookie):

   if os.path.isfile( cookie_file_path ):
      if os.access(cookie_file_path, os.W_OK) is False:
         print ("Cannot write cookie file!")
         return False

   # a context manager guarantees the cookie file is flushed and closed
   with open(cookie_file_path, 'w') as cookie_file:
      cookie_file.write(cookie)
   return True

# Validate cookie before we begin
def check_cookie(cookie):
   if cookie is None:
      return False

   # File we know is valid, used to validate the cookie
   file_check = 'https://datapool.asf.alaska.edu/GEOTIFF/SS/SS_01499_STD_F1309_tif.zip'

   # Catch redirects, since a redirect would mean a problem with the cookie
   class NoRedirectHandler(HTTPRedirectHandler):
      def http_error_302(self, req, fp, code, msg, headers):
         infourl = addinfourl(fp, headers, req.get_full_url())
         infourl.status = code
         infourl.code = code
         return infourl
      http_error_300 = http_error_302
      http_error_301 = http_error_302
      http_error_303 = http_error_302

   # Apply the custom redirect handler
   opener = build_opener(NoRedirectHandler())
   install_opener(opener)

   # Attempt a HEAD request
   request = Request(file_check)
   request.add_header('Cookie', 'datapool='+cookie)
   request.get_method = lambda : 'HEAD'
   try:
      response = urlopen(request)
      resp_code = response.getcode()

   except HTTPError as e:
      # If we get this error, it likely means the user has not agreed to the current EULA
      print ("\nIMPORTANT: ")
      print ("Your user appears to lack permissions to download data from the ASF Datapool. This is probably because you have not agreed to the current EULA. Please log into URS from Vertex (https://vertex.daac.asf.alaska.edu) and confirm the EULA. If this error persists, please contact ASF User Services\n\n")
      exit(-1)

   # These return codes indicate the user has not been approved to download the data
   if resp_code in (300, 301, 302, 303):
      try:
         redir_url = response.info().getheader('Location')
      except AttributeError:
         redir_url = response.getheader('Location')

      # Funky test env:
      if ("vertex.daac.asf.alaska.edu" in redir_url and "test" in asf_urs4['redir']):
         print ("Cough, cough. It's dusty in this test env!")
         return True

      print ("Redirect ({0}) occurred, invalid datapool cookie value!".format(resp_code))
      return False

   # These are successes!
   if resp_code in (200, 307):
      return True

   return False

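# The validation above, sketched as a raw HTTP exchange (illustrative
# values, not captured traffic):
#
#    HEAD /GEOTIFF/SS/SS_01499_STD_F1309_tif.zip HTTP/1.1
#    Host: datapool.asf.alaska.edu
#    Cookie: datapool=<cookie value>
#
# A 200 or 307 response means the cookie is still good; a 30x redirect,
# which NoRedirectHandler surfaces instead of following, means the
# datapool bounced us back to URS and the cookie is stale or invalid.
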
def get_new_cookie():

   # Start by prompting the user to input their credentials

   # Another Python 2/3 workaround
   try:
      new_username = raw_input("Username: ")
   except NameError:
      new_username = input("Username: ")
   new_password = getpass.getpass(prompt="Password (Will not be Echoed!): ")

   # Build the URS4 cookie request
   auth_cookie_url = asf_urs4['url'] + '?client_id=' + asf_urs4['client'] + '&redirect_uri=' + asf_urs4['redir'] + '&response_type=code&state='

   try:
      # Python 2
      user_pass = base64.b64encode(bytes(new_username+":"+new_password))
   except TypeError:
      # Python 3
      user_pass = base64.b64encode(bytes(new_username+":"+new_password, "utf-8"))
      user_pass = user_pass.decode("utf-8")

   # Authenticate against URS, grab all the cookies
   cj = CookieJar()
   opener = build_opener(HTTPCookieProcessor(cj), HTTPHandler())
   request = Request(auth_cookie_url, headers={"Authorization": "Basic {0}".format(user_pass)})

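   # As a worked example with hypothetical credentials (not from this
   # script): "user:pass" base64-encodes to "dXNlcjpwYXNz", so the header
   # built above would read: Authorization: Basic dXNlcjpwYXNz
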
   # Watch out for cookie rejection!
   try:
      response = opener.open(request)
   except HTTPError as e:
      if e.code == 401:
         print (" > Username and Password combo was not successful. Please try again.")
         return None
      else:
         # If an error happens here, the user most likely has not confirmed the EULA.
         print ("\nIMPORTANT: There was an error obtaining a download cookie!")
         print ("Your user appears to lack permissions to download data from the ASF Datapool. This is probably because you have not agreed to the current EULA. Please log into URS from Vertex (https://vertex.daac.asf.alaska.edu) and confirm the EULA. If this error persists, please contact ASF User Services\n\n")
         exit(-1)
   except URLError as e:
      print ("\nIMPORTANT: There was a problem communicating with URS, unable to obtain a cookie. ")
      print ("Try cookie generation later. ")
      exit(-1)

   # Did we get a cookie?
   for cookie in cj:
      if cookie.name == 'datapool':
         # COOKIE SUCCESS!
         write_cookie_to_file(cookie.value)
         return cookie.value

   # If we aren't successful generating the cookie, nothing will work. Stop here!
   print ("WARNING: Could not generate a new cookie! Cannot proceed. Please try Username and Password again")
   print ("Response was {0}.".format(response.getcode()))
   exit(-1)

# Download the file
def download_file_with_cookie(file, cookie, cnt, total):

   # see if we've already downloaded this file
   download_file = os.path.basename(file)
   if os.path.isfile(download_file):
      print (" > Download file {0} exists! \n > Skipping download of {1}. ".format(download_file, file))
      print (" > If you want to re-download it, move or remove that file ")
      return None

   # attempt https connection
   try:
      request = Request(file)
      request.add_header('Cookie', 'datapool='+cookie)
      response = urlopen(request)

      # Watch for a redirect
      resp_code = response.getcode()
      if response.geturl() != file:
         print (" > Temporary Redirect download @ ASF Remote archive:\n > {0}".format(response.geturl()))

      # seems to be working
      print ("({0}/{1}) Downloading {2}".format(cnt, total, file))

      # Open our local file for writing and build the status bar
      tf = tempfile.NamedTemporaryFile(mode='w+b', delete=False)
      chunk_read(response, tf, report_hook=chunk_report)

      tempfile_name = tf.name
      tf.close()

   # handle errors
   except HTTPError as e:
      print ("HTTP Error:", e.code, file)
      if e.code == 401:
         print (" > IMPORTANT: Your user does not have permissions to download this type of data!")
      return False

   except URLError as e:
      print ("URL Error:", e.reason, file)
      return False

   # Move the download into place and return the file size
   shutil.copy(tempfile_name, download_file)
   os.remove(tempfile_name)
   return os.path.getsize(download_file)

#  chunk_report taken from http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
def chunk_report(bytes_so_far, chunk_size, total_size):
   percent = float(bytes_so_far) / total_size
   percent = round(percent*100, 2)
   sys.stdout.write(" > Downloaded %d of %d bytes (%0.2f%%)\r" %
       (bytes_so_far, total_size, percent))

   if bytes_so_far >= total_size:
      sys.stdout.write('\n')

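# Worked example: 4194304 of 8388608 bytes gives 4194304/8388608 = 0.5,
# so the line printed is " > Downloaded 4194304 of 8388608 bytes (50.00%)".
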
#  chunk_read modified from http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
def chunk_read(response, local_file, chunk_size=8192, report_hook=None):
   try:
      total_size = response.info().getheader('Content-Length').strip()
   except AttributeError:
      total_size = response.getheader('Content-Length').strip()
   total_size = int(total_size)
   bytes_so_far = 0

   while True:
      chunk = response.read(chunk_size)
      try:
         local_file.write(chunk)
      except TypeError:
         # text-mode file handle: decode the chunk before writing
         local_file.write(chunk.decode(local_file.encoding))
      bytes_so_far += len(chunk)

      if not chunk:
         break

      if report_hook:
         report_hook(bytes_so_far, chunk_size, total_size)

   return bytes_so_far

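# A minimal standalone use of chunk_read, kept commented out (illustrative
# sketch only; the URL is a placeholder, not part of this script's flow):
#
#    response = urlopen(Request('https://example.com/file.zip'))
#    with open('file.zip', 'wb') as f:
#       chunk_read(response, f, report_hook=chunk_report)
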
# Get download urls from a metalink file
def process_metalink(ml_file):

   print ("Processing metalink file: {0}".format(ml_file))
   with open(ml_file, 'r') as ml:
      xml = ml.read()

   # Hack to remove the annoying namespace
   it = ET.iterparse(StringIO(xml))
   for _, el in it:
      if '}' in el.tag:
         el.tag = el.tag.split('}', 1)[1]  # strip all namespaces
   root = it.root

   dl_urls = []
   files = root.find('files')
   for dl in files:
      dl_urls.append(dl.find('resources').find('url').text)

   if len(dl_urls) > 0:
      return dl_urls
   else:
      return None

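# process_metalink() expects a metalink shaped roughly like this skeleton,
# inferred from the find() calls above rather than from a real Vertex file:
#
#    <metalink xmlns="...">
#      <files>
#        <file name="...">
#          <resources>
#            <url>https://datapool.asf.alaska.edu/.../granule.zip</url>
#          </resources>
#        </file>
#      </files>
#    </metalink>
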
# Get download urls from a csv file
def process_csv(csv_file):

   print ("Processing csv file: {0}".format(csv_file))

   dl_urls = []
   with open(csv_file, 'r') as csvf:
      try:
         csvr = csv.DictReader(csvf)
         for row in csvr:
            dl_urls.append(row['URL'])
      except csv.Error as e:
         print ("WARNING: Could not parse file %s, line %d: %s. Skipping." % (csv_file, csvr.line_num, e))
         return None
      except KeyError as e:
         print ("WARNING: Could not find a 'URL' column in file %s. Skipping." % (csv_file))

   if len(dl_urls) > 0:
      return dl_urls
   else:
      return None

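# process_csv() only requires a header row containing a 'URL' column; a
# hypothetical input (extra columns are simply ignored) could look like:
#
#    Granule Name,URL
#    S1B_EW_RAW__0SDV_...,https://datapool.asf.alaska.edu/RAW/SB/granule.zip
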
if __name__ == "__main__":

   # Make sure we can write to our current directory
   if os.access(os.getcwd(), os.W_OK) is False:
      print ("WARNING: Cannot write to current path! Check permissions for {0}".format(os.getcwd()))
      exit(-1)

   # grab a cookie
   cookie = get_cookie()

   # Check if the user handed in a Metalink or CSV:
   if len(sys.argv) > 1:
      download_files = []
      input_files = []
      for arg in sys.argv[1:]:
         if arg.endswith('.metalink') or arg.endswith('.csv'):
            if os.path.isfile( arg ):
               input_files.append( arg )
               if arg.endswith('.metalink'):
                  new_files = process_metalink(arg)
               else:
                  new_files = process_csv(arg)
               if new_files is not None:
                  for file_url in new_files:
                     download_files.append( file_url )
            else:
               print (" > I cannot find the input file you specified: {0}".format(arg))
         else:
            print (" > Command line argument '{0}' makes no sense, ignoring".format(arg))

      if len(input_files) > 0:
         if len(download_files) > 0:
            print (" > Processing {0} downloads from {1} input files. ".format(len(download_files), len(input_files)))
            files = download_files
         else:
            print (" > I see you asked me to download files from {0} input files, but they had no downloads! ".format(len(input_files)))
            print (" > I'm super confused and exiting.")
            exit(-1)

   # summary
   total_bytes = 0
   total_time = 0
   cnt = 0
   success = []
   failed = []
   skipped = []

   for file in files:

      # download counter
      cnt += 1

      # set a timer
      start = time.time()

      # run the download
      size = download_file_with_cookie(file, cookie, cnt, len(files))

      # calculate the rate
      end = time.time()

      # stats:
      if size is None:
         skipped.append(file)

      elif size is not False:
         # Download was good!
         elapsed = end - start
         elapsed = 1.0 if elapsed < 1 else elapsed
         rate = (size/1024.0**2)/elapsed

         print ("Downloaded {0} bytes in {1:.2f} secs, Average Rate: {2:.2f}MB/sec".format(size, elapsed, rate))

         # add up metrics
         total_bytes += size
         total_time += elapsed
         success.append( {'file':file, 'size':size } )

      else:
         print ("There was a problem downloading {0}".format(file))
         failed.append(file)

   # Print summary:
   print ("\n\nDownload Summary ")
   print ("--------------------------------------------------------------------------------")
   print ("  Successes: {0} files, {1} bytes ".format(len(success), total_bytes))
   for success_file in success:
      print ("           - {0}  {1:.2f}MB".format(success_file['file'],(success_file['size']/1024.0**2)))
   if len(failed) > 0:
      print ("  Failures: {0} files".format(len(failed)))
      for failed_file in failed:
         print ("          - {0}".format(failed_file))
   if len(skipped) > 0:
      print ("  Skipped: {0} files".format(len(skipped)))
      for skipped_file in skipped:
         print ("          - {0}".format(skipped_file))
   if len(success) > 0:
      print ("  Average Rate: {0:.2f}MB/sec".format( (total_bytes/1024.0**2)/total_time))
   print ("--------------------------------------------------------------------------------")