Advertisement
reclosedev

Environmental Health Preston get postcodes

Apr 17th, 2012
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.59 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. import functools
  4. import json
  5. import re
  6. from urlparse import urljoin
  7.  
  8. import requests
  9. from requests import async
  10. import lxml.html
  11.  
  12. import requests_cache
  13. # Because I don't want to bother about persistence or explicitly saved state :)
  14. # https://github.com/reclosedev/requests-cache
  15. requests_cache.configure('env_health')
  16.  
  17.  
  18. BASE_URL = 'http://www.environmentalhealthpreston.co.uk/'
  19.  
  20.  
  21. def main():
  22.     # Get 'moreinfo' urls from Zarino Zappia's scrapper https://scraperwiki.com/scrapers/environmental_health_preston/
  23.     r = requests.get('https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsonlist&name=environmental_health_preston&query=select%20url%20from%20%60eateries%60')
  24.     urls = [url[0] for url in json.loads(r.text)['data']]
  25.     results = {}
  26.  
  27.     def process_page(url_key, response):
  28.         doc = lxml.html.fromstring(response.content)
  29.         text = doc.xpath('//div[@class="content-text-sidebar"]//p[1]')[0].text_content()
  30.         results[url_key] = retrieve_postcode(text)
  31.  
  32.     rs = [async.get(urljoin(BASE_URL, url),
  33.                     hooks={'response': functools.partial(process_page, url)})
  34.           for url in urls]
  35.     async.map(rs, size=10)
  36.     print json.dumps(results)
  37.  
  38.  
  39. def retrieve_postcode(text):
  40.     # credits: http://regexlib.com/REDetails.aspx?regexp_id=260&AspxAutoDetectCookieSupport=1
  41.     m = re.search(r'([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)', text)
  42.     if m:
  43.         return m.group(0)
  44.     return ''
  45.  
  46. if __name__ == '__main__':
  47.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement