isendrak

gelbeseiten.py

Mar 1st, 2018
454
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.95 KB | None | 0 0
  1. #filename: gelbeseiten.py
  2. #license: https://creativecommons.org/licenses/by-sa/4.0/
  3. #coding: utf-8
  4. import codecs, socket, http.client, re, base64, urllib.parse, sys, html
  5. from collections import namedtuple
  6. Address = namedtuple('Address', ['name', 'street', 'postcode', 'locality', 'phone', 'email'])
  7. def get_first(array, default=None):
  8.     if len(array) > 0: return array[0]
  9.     return default
  10. def httpsGet(url):
  11.     url = urllib.parse.urlsplit(url)
  12.     try:
  13.         while True:
  14.             if url.scheme.upper()=='HTTP':
  15.                 connection = http.client.HTTPConnection(url.netloc)
  16.             elif url.scheme.upper()=='HTTPS':
  17.                 connection = http.client.HTTPSConnection(url.netloc)
  18.             else:
  19.                 return -2, None
  20.             connection.request('GET', url.path+'?'+url.query)
  21.             response = connection.getresponse()
  22.             if response.status in (301, 302):
  23.                 url = urllib.parse.urlsplit(response.getheader('Location'))
  24.                 response.close()
  25.                 connection.close()
  26.             else:
  27.                 break
  28.         body = response.read()
  29.         response.close()
  30.         connection.close()
  31.         return response.status, body
  32.     except socket.gaierror as e:
  33.         return -1, e
  34.     except OSError as e:
  35.         return -1, e
  36. def parse_page(pagedata):
  37.     addresses = []
  38.     for result in re.findall("(<article.*?</article>)", pagedata):
  39.         name = get_first(re.findall('<span[ \t]+itemprop="name">(.*?)</span>', result),'')
  40.         street = get_first(re.findall('<span[ \t]+itemprop="streetAddress">(.*?)</span>', result),'')
  41.         postcode = get_first(re.findall('<span[ \t]+itemprop="postalCode">(.*?)</span>', result),'')
  42.         locality = get_first(re.findall('<span[ \t]+itemprop="addressLocality">(.*?)</span>', result),'')
  43.         phone = get_first(re.findall('<span[ \t]+class="nummer">(.*?)</span>', result),'')
  44.         email = get_first(re.findall('href="mailto:([^\\?"]+)', result),'')
  45.         suffix = get_first(re.findall('<span[ \t]+class="suffix[^"]*"[^>]*>', result), None)
  46.         if suffix:
  47.             phone += str(base64.b64decode(get_first(re.findall('data-telsuffix="([^"]+)"', suffix),'')),'utf-8')
  48.         addresses.append(Address(html.unescape(name), html.unescape(street), html.unescape(postcode), html.unescape(locality), html.unescape(phone), html.unescape(email)))
  49.     return addresses
  50. def search(what, where):
  51.     pid = 1
  52.     addresses = []
  53.     while True:
  54.         code, data = httpsGet('https://www.gelbeseiten.de/{0}/{1}/s{2}'.format(urllib.parse.quote(what), urllib.parse.quote(where), pid))
  55.         if code != 200: break
  56.         try:
  57.             data = str(data, 'utf-8')
  58.         except UnicodeDecodeError:
  59.             data = str(data, 'iso-8859-1')
  60.         data = data.replace('\r','').replace('\n','')
  61.         addresses += parse_page(data)
  62.         pid += 1
  63.         yield addresses
  64. if len(sys.argv) < 3:
  65.     print('Syntax: {0} <was> <wo>'.format(sys.argv[0]))
  66. else:
  67.     what = sys.argv[1]
  68.     where = sys.argv[2]
  69.     print('"Name";"Straße";"PLZ";"Stadt";"Telefon","E-Mail"')
  70.     for block in search(what, where):
  71.         for address in block:
  72.             print('"{0}";"{1}";"{2}";"{3}";"{4}";"{5}"'.format(address.name,address.street,address.postcode,address.locality,address.phone,address.email))
Add Comment
Please, Sign In to add comment