Advertisement
furas

Python - NYC documents

Sep 8th, 2016
144
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.06 KB | None | 0 0
  1. #
  2. # http://stackoverflow.com/questions/39398592/python-get-javascript-file-from-href-tag-of-html
  3. #
  4.  
  5. import requests
  6. import lxml.html
  7.  
  8. url = 'http://a810-bisweb.nyc.gov/bisweb/COsByLocationServlet?requestid=1&allbin=3055311'
  9.  
  10. r = requests.get(url)
  11. #print(r.status_code)
  12.  
  13. html = lxml.html.fromstring(r.text)
  14.  
  15. for a in html.xpath('//td[@class="content"]/a[contains(@href, "javascript")]'):
  16.     name = a.attrib['href'][14:-12]
  17.     parts = name.split('_')
  18.     f0 = parts[1]
  19.     f1 = parts[4][0] # B
  20.     f2 = parts[4][1:4] # 000
  21.     f3 = parts[4][4:7] + '000' # 11400
  22.     f4 = parts[4]
  23.    
  24.     download = [
  25.         'http://a810-bisweb.nyc.gov/bisweb/CofoDocumentContentServlet',
  26.         '?cofomatadata1=', f0,
  27.         '&cofomatadata2=', f1,
  28.         '&cofomatadata3=', f2,
  29.         '&cofomatadata4=', f3,
  30.         '&cofomatadata5=', f4,
  31.     ]
  32.  
  33.     download = ''.join(download)
  34.     r = requests.get(download, stream=True)
  35.    
  36.     print('Download:', f4)
  37.  
  38.     with open(f4, 'wb') as fout:
  39.         for chunk in r.iter_content(1024):
  40.             fout.write(chunk)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement