Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #filename: gelbeseiten.py
- #license: https://creativecommons.org/licenses/by-sa/4.0/
- #coding: utf-8
import base64
import codecs
import csv
import html
import http.client
import re
import socket
import sys
import urllib.parse
from collections import namedtuple
# Immutable record describing a single directory listing. All six fields are
# plain strings; a missing value is stored as '' by the page parser.
Address = namedtuple(
    'Address',
    'name street postcode locality phone email',
)
def get_first(array, default=None):
    """Return the first item of ``array``, or ``default`` when it is empty.

    Any iterable is accepted (list, tuple, set, generator, iterator), not
    only sequences that support ``len()`` and indexing.  Note that passing
    an iterator advances it by one item.

    Args:
        array: The iterable to take the first item from.
        default: Value returned when ``array`` yields nothing.

    Returns:
        The first item produced by ``array``, or ``default``.
    """
    return next(iter(array), default)
def httpsGet(url, max_redirects=10):
    """Fetch ``url`` with a GET request, following HTTP redirects.

    Args:
        url: Absolute ``http://`` or ``https://`` URL.
        max_redirects: Upper bound on the number of redirects followed, so
            that a redirect cycle cannot loop forever.

    Returns:
        A ``(status, payload)`` tuple:

        * ``(http_status, body_bytes)`` once a response has been read.  A
          redirect response is returned as-is when it carries no
          ``Location`` header or when ``max_redirects`` is exhausted.
        * ``(-2, None)`` if the URL (or a redirect target) does not use the
          http or https scheme.
        * ``(-1, exception)`` if an ``OSError`` occurs (this includes DNS
          failures, since ``socket.gaierror`` is a subclass of ``OSError``).
    """
    redirect_statuses = (301, 302, 303, 307, 308)
    current_url = url
    redirects_followed = 0
    try:
        while True:
            parts = urllib.parse.urlsplit(current_url)
            scheme = parts.scheme.lower()
            if scheme == 'http':
                connection = http.client.HTTPConnection(parts.netloc)
            elif scheme == 'https':
                connection = http.client.HTTPSConnection(parts.netloc)
            else:
                return -2, None
            try:
                # An empty path must be sent as '/', and '?' is only valid
                # when a query string actually follows it.
                target = parts.path or '/'
                if parts.query:
                    target += '?' + parts.query
                connection.request('GET', target)
                response = connection.getresponse()
                location = response.getheader('Location')
                if (response.status in redirect_statuses and location
                        and redirects_followed < max_redirects):
                    # Location may be relative; resolve it against the URL
                    # that produced the redirect.
                    current_url = urllib.parse.urljoin(current_url, location)
                    redirects_followed += 1
                    continue
                return response.status, response.read()
            finally:
                # Runs on return, on redirect and on error, so the socket is
                # never leaked; closing the connection also closes the
                # response object.
                connection.close()
    except OSError as e:
        return -1, e
def parse_page(pagedata):
    """Extract all listings from the HTML of one result page.

    ``pagedata`` is scanned for ``<article>...</article>`` spans.  None of
    the patterns use ``re.DOTALL``, so an article or a field value that
    contains a newline is not matched.

    For each article the first occurrence of every field is taken; a field
    that is not found becomes ``''``.  If the article contains a
    ``<span class="suffix...">`` tag, its ``data-telsuffix`` attribute is
    base64-decoded as UTF-8 and appended to the phone number.  All values
    are HTML-unescaped.

    Returns:
        A list of ``Address`` tuples in page order.
    """
    def grab(pattern, text):
        # First capture group of the leftmost match, or '' when absent.
        hit = re.search(pattern, text)
        return hit.group(1) if hit else ''

    # Patterns in Address field order: name, street, postcode, locality,
    # phone, email.
    field_patterns = (
        '<span[ \t]+itemprop="name">(.*?)</span>',
        '<span[ \t]+itemprop="streetAddress">(.*?)</span>',
        '<span[ \t]+itemprop="postalCode">(.*?)</span>',
        '<span[ \t]+itemprop="addressLocality">(.*?)</span>',
        '<span[ \t]+class="nummer">(.*?)</span>',
        'href="mailto:([^\\?"]+)',
    )
    phone_index = 4

    listings = []
    for article in re.findall("(<article.*?</article>)", pagedata):
        values = [grab(pattern, article) for pattern in field_patterns]

        suffix_tag = re.search('<span[ \t]+class="suffix[^"]*"[^>]*>', article)
        if suffix_tag:
            encoded = grab('data-telsuffix="([^"]+)"', suffix_tag.group(0))
            values[phone_index] += base64.b64decode(encoded).decode('utf-8')

        listings.append(Address(*map(html.unescape, values)))
    return listings
def search(what, where):
    """Yield the listings for ``what`` in ``where``, one result page at a time.

    Result pages are requested with an increasing page number starting at 1.
    Iteration ends as soon as a request does not return HTTP status 200 or a
    page contains no listings.

    Args:
        what: Search term; URL-quoted into the request path.
        where: Location; URL-quoted into the request path.

    Yields:
        A list of ``Address`` tuples holding only the listings of the page
        just fetched, so a consumer that iterates every yielded list sees
        each listing exactly once.
    """
    page_number = 1
    while True:
        url = 'https://www.gelbeseiten.de/{0}/{1}/s{2}'.format(
            urllib.parse.quote(what),
            urllib.parse.quote(where),
            page_number,
        )
        code, data = httpsGet(url)
        if code != 200:
            break
        try:
            text = data.decode('utf-8')
        except UnicodeDecodeError:
            # iso-8859-1 maps every byte value, so this fallback cannot fail.
            text = data.decode('iso-8859-1')
        # parse_page's patterns do not match across line breaks.
        text = text.replace('\r', '').replace('\n', '')
        page_addresses = parse_page(text)
        if not page_addresses:
            # A 200 response without listings would otherwise keep the loop
            # running forever.
            break
        yield page_addresses
        page_number += 1
# Command-line entry: gelbeseiten.py <what> <where>
# Writes the matching listings to stdout as semicolon-separated CSV with
# every field quoted.
if len(sys.argv) < 3:
    print('Syntax: {0} <was> <wo>'.format(sys.argv[0]))
else:
    what = sys.argv[1]
    where = sys.argv[2]
    # csv.writer doubles any '"' inside a value, keeping each row parseable.
    # lineterminator='\n' matches the line ending print() would produce.
    writer = csv.writer(
        sys.stdout,
        delimiter=';',
        quoting=csv.QUOTE_ALL,
        lineterminator='\n',
    )
    # Every header column is separated by ';', the same as the data rows.
    writer.writerow(['Name', 'Straße', 'PLZ', 'Stadt', 'Telefon', 'E-Mail'])
    for block in search(what, where):
        for address in block:
            writer.writerow([
                address.name,
                address.street,
                address.postcode,
                address.locality,
                address.phone,
                address.email,
            ])
Add Comment
Please, Sign In to add comment