  1. """This script, NSIDC_parse_HTML_BatchDL.py, defines an HTML parser to scrape data files from an earthdata HTTPS URL and bulk downloads all files to your working directory.
  2.  
  3. This code was adapted from https://wiki.earthdata.nasa.gov/display/EL/How+To+Access+Data+With+Python
  4. Last edited Jan 26, 2017 G. Deemer"""
  5.  
  6. import urllib2
  7. import os
  8. from cookielib import CookieJar
  9. from HTMLParser import HTMLParser
  10.  
# Define a custom HTML parser to scrape the contents of the HTML data table
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.inLink = False
        self.dataList = []
        self.directory = '/'
        self.indexcol = ';'
        self.Counter = 0

    def handle_starttag(self, tag, attrs):
        self.inLink = False
        if tag == 'table':
            self.Counter += 1
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    if self.directory in value or self.indexcol in value:
                        break
                    else:
                        self.inLink = True
                        self.lasttag = tag

    def handle_endtag(self, tag):
        if tag == 'table':
            self.Counter += 1

    def handle_data(self, data):
        if self.Counter == 1:
            if self.lasttag == 'a' and self.inLink and data.strip():
                self.dataList.append(data)

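# Note (added): Counter is incremented on both the opening and closing <table>
# tags, so handle_data() only records anchor text while the parser is inside
# the first table (the directory listing). href values containing '/' (parent
# or subdirectory links) or ';' (column-sort links) are skipped, which leaves
# just the data-file names in self.dataList.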
parser = MyHTMLParser()

# Define function for batch downloading
def BatchJob(Files, cookie_jar):
    for dat in Files:
        print "downloading: ", dat
        JobRequest = urllib2.Request(url + dat)
        JobRequest.add_header('cookie', cookie_jar)  # Pass the saved cookie into the additional HTTP request
        JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401'

        # Request the resource at the modified redirect url
        Request = urllib2.Request(JobRedirect_url)
        Response = urllib2.urlopen(Request)
        f = open(dat, 'wb')
        f.write(Response.read())
        f.close()
        Response.close()
    print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__))
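
# Note (added): BatchJob() reads the module-level `url` defined below and
# relies on the opener installed in the authentication block, so it must be
# called only after that block has run (as it is at the bottom of this script).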
#===========================================================================
# The following code block is used for HTTPS authentication
#===========================================================================

# The user credentials that will be used to authenticate access to the data
username = "isaque"
password = "Isaque12345"

# The FULL url of the directory which contains the files you would like to bulk download
url = "https://n5eil01u.ecs.nsidc.org/SMAP/SPL4SMGP.003/2017.10.14/"  # Example URL
# Create a password manager to deal with the 401 response that is returned from
# Earthdata Login
password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_manager.add_password(None, "https://urs.earthdata.nasa.gov", username, password)

# Create a cookie jar for storing cookies. This is used to store and return
# the session cookie given to us by the data server (otherwise it will just
# keep sending us back to Earthdata Login to authenticate). Ideally, we
# should use a file-based cookie jar to preserve cookies between runs; that
# would make repeated use much more efficient.
cookie_jar = CookieJar()

# Install all the handlers.
opener = urllib2.build_opener(
    urllib2.HTTPBasicAuthHandler(password_manager),
    #urllib2.HTTPHandler(debuglevel=1),   # Uncomment these two lines to see
    #urllib2.HTTPSHandler(debuglevel=1),  # details of the requests/responses
    urllib2.HTTPCookieProcessor(cookie_jar))
urllib2.install_opener(opener)

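# Note (added): install_opener() makes every subsequent urllib2.urlopen() call
# in this script use the basic-auth and cookie handlers above, so both the
# directory listing and each file download authenticate automatically.
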
# Create and submit the requests. There is a wide range of exceptions that
# can be thrown here, including HTTPError and URLError. These should be
# caught and handled.

#===========================================================================
# Open a request to grab filenames within a directory. Printing them is optional.
#===========================================================================

DirRequest = urllib2.Request(url)
DirResponse = urllib2.urlopen(DirRequest)

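# A minimal sketch (added) of the error handling suggested above, shown as a
# commented example rather than wired into the script:
#
#   try:
#       DirResponse = urllib2.urlopen(DirRequest)
#   except urllib2.HTTPError as e:
#       print "The server returned an HTTP error: ", e.code
#   except urllib2.URLError as e:
#       print "Failed to reach the server: ", e.reason
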
# Get the redirect url (the request is bounced to Earthdata Login) and append
# 'app_type=401' so Earthdata Login answers with an HTTP 401 challenge that
# the password manager above can satisfy with basic auth
DirRedirect_url = DirResponse.geturl()
DirRedirect_url += '&app_type=401'

# Request the resource at the modified redirect url
DirRequest = urllib2.Request(DirRedirect_url)
DirResponse = urllib2.urlopen(DirRequest)

DirBody = DirResponse.read()

# Use the HTML parser defined above to collect the data-file names listed in the directory
parser.feed(DirBody)
Files = parser.dataList

# Display the contents of the python list declared in the HTMLParser class
# print Files  # Uncomment to print a list of the files

#=========================================================================
# Call the function to download all files in url
#=========================================================================

BatchJob(Files, cookie_jar)  # Comment out to prevent downloading to your working directory
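
# Optional sanity check (added sketch): after the batch job finishes, list
# which of the scraped filenames actually landed in the working directory.
# Uncomment to use.
# for fname in Files:
#     if os.path.isfile(fname):
#         print "saved: ", fname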