Advertisement
Guest User

Simple Scraper

a guest
Dec 9th, 2019
117
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.54 KB | None | 0 0
  1. from lxml import html
  2. import posixpath
  3. import urllib2
  4.  
  5. if __name__ == '__main__':
  6.     url = 'https://archive.epa.gov/airtoxics/nata1999/web/html/tables.html'
  7.     t = html.parse(urllib2.urlopen(url))
  8.     select_eles = t.xpath('//select')
  9.     for select_ele in select_eles:
  10.         to_download = []
  11.         base_url = posixpath.dirname(url)
  12.         for option in select_ele.xpath('option'):
  13.             if option.attrib['value'].startswith('javascript'):
  14.                 continue
  15.             print posixpath.join(base_url, option.attrib['value'])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement