Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import functools
import json
import re
from urlparse import urljoin  # Python 2 stdlib; 'urllib.parse' in Python 3
import requests
from requests import async  # NOTE(review): gevent-backed concurrency; removed from requests >= 1.0 (now the separate 'grequests' package) — confirm pinned requests version
import lxml.html
import requests_cache
# Because I don't want to bother about persistence or explicitly saved state :)
# https://github.com/reclosedev/requests-cache
requests_cache.configure('env_health')  # transparently cache all HTTP responses in 'env_health' store
BASE_URL = 'http://www.environmentalhealthpreston.co.uk/'  # base the scraped relative URLs are joined against
- def main():
- # Get 'moreinfo' urls from Zarino Zappia's scrapper https://scraperwiki.com/scrapers/environmental_health_preston/
- r = requests.get('https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=jsonlist&name=environmental_health_preston&query=select%20url%20from%20%60eateries%60')
- urls = [url[0] for url in json.loads(r.text)['data']]
- results = {}
- def process_page(url_key, response):
- doc = lxml.html.fromstring(response.content)
- text = doc.xpath('//div[@class="content-text-sidebar"]//p[1]')[0].text_content()
- results[url_key] = retrieve_postcode(text)
- rs = [async.get(urljoin(BASE_URL, url),
- hooks={'response': functools.partial(process_page, url)})
- for url in urls]
- async.map(rs, size=10)
- print json.dumps(results)
def retrieve_postcode(text):
    """Return the first UK postcode found in *text*, or '' if none.

    Pattern credit:
    http://regexlib.com/REDetails.aspx?regexp_id=260&AspxAutoDetectCookieSupport=1
    """
    pattern = (r'([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?'
               r'[ABEHMNPRVWXY0-9]? {1,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)')
    match = re.search(pattern, text)
    return match.group(0) if match else ''
# Script entry point: run the scrape when invoked directly.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement