Guest User

Untitled

a guest
Aug 18th, 2019
113
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.24 KB | None | 0 0
  1. #coding: utf-8
  2. import codecs, socket, http.client, re, base64, urllib.parse, sys, html
  3. from collections import namedtuple
  # Record type for a single directory entry; parse_page() fills the fields
  # positionally in exactly this order.
  Address = namedtuple('Address', 'name street postcode locality phone email')
  def get_first(array, default=None):
      """Return the first element of ``array``, or ``default`` if it is empty.

      Emptiness is decided by ``len(array)``, so a first element that happens
      to be falsy (``0``, ``''``, ``None``) is still returned as-is.
      """
      if len(array) == 0:
          return default
      return array[0]
  def httpsGet(url, max_redirects=10):
      """Fetch ``url`` with a GET request, following HTTP redirects.

      Parameters:
          url: absolute ``http://`` or ``https://`` URL to fetch.
          max_redirects: upper bound on the number of redirects that are
              followed before the redirect response itself is returned.

      Returns a ``(status, payload)`` tuple:
          * ``(status, body)`` - HTTP status code and the raw response bytes,
          * ``(-2, None)``     - the URL (or a redirect target) uses a scheme
                                 other than http/https,
          * ``(-1, exc)``      - an ``OSError`` (this includes
                                 ``socket.gaierror``) occurred; ``exc`` is the
                                 exception instance.
      """
      redirect_codes = (301, 302, 303, 307, 308)
      redirects_left = max_redirects
      try:
          while True:
              parts = urllib.parse.urlsplit(url)
              scheme = parts.scheme.lower()
              if scheme == 'http':
                  connection = http.client.HTTPConnection(parts.netloc)
              elif scheme == 'https':
                  connection = http.client.HTTPSConnection(parts.netloc)
              else:
                  return -2, None
              try:
                  # The request target must start with "/" and should only
                  # carry a "?" when there actually is a query string.
                  target = parts.path or '/'
                  if parts.query:
                      target += '?' + parts.query
                  connection.request('GET', target)
                  response = connection.getresponse()
                  status = response.status
                  location = response.getheader('Location')
                  body = response.read()
              finally:
                  # Release the socket even when the request or read fails.
                  connection.close()
              if status in redirect_codes and location and redirects_left > 0:
                  redirects_left -= 1
                  # Location may be relative; resolve it against the URL that
                  # was just requested.
                  url = urllib.parse.urljoin(url, location)
                  continue
              return status, body
      except OSError as e:
          # socket.gaierror is a subclass of OSError, so one clause covers both.
          return -1, e
  def parse_page(pagedata):
      """Extract one ``Address`` per ``<article>...</article>`` span in ``pagedata``.

      ``pagedata`` is the page HTML as a string. The patterns are applied
      without ``re.DOTALL``, so an article is only matched if it contains no
      line breaks (the caller strips them beforehand). Missing fields become
      empty strings, and every field is passed through ``html.unescape``.
      Returns a list of ``Address`` tuples in document order.
      """
      def pick(pattern, text, fallback=''):
          # First regex hit in ``text`` or ``fallback`` when nothing matches.
          return get_first(re.findall(pattern, text), fallback)

      found = []
      for article in re.findall("(<article.*?</article>)", pagedata):
          fields = [
              pick('<span[ \t]+itemprop="name">(.*?)</span>', article),
              pick('<span[ \t]+itemprop="streetAddress">(.*?)</span>', article),
              pick('<span[ \t]+itemprop="postalCode">(.*?)</span>', article),
              pick('<span[ \t]+itemprop="addressLocality">(.*?)</span>', article),
          ]
          number = pick('<span[ \t]+class="nummer">(.*?)</span>', article)
          mail = pick('href="mailto:([^\\?"]+)', article)
          suffix_tag = pick('<span[ \t]+class="suffix[^"]*"[^>]*>', article, None)
          if suffix_tag:
              # The tail of the phone number is carried base64-encoded in the
              # data-telsuffix attribute of the "suffix" span.
              encoded = pick('data-telsuffix="([^"]+)"', suffix_tag)
              number += str(base64.b64decode(encoded), 'utf-8')
          fields.append(number)
          fields.append(mail)
          found.append(Address(*map(html.unescape, fields)))
      return found
  def search(what, where):
      """Yield the addresses of each result page for a gelbeseiten.de query.

      ``what`` and ``where`` are URL-quoted and inserted into the search URL.
      Pages are requested one after another, starting at page 1. Every yielded
      item is the list of ``Address`` records parsed from that one page only,
      so iterating over all yielded lists visits each record exactly once.

      Iteration ends when a request does not return status 200 or when a page
      contains no addresses (which guards against looping forever if the
      server keeps answering 200 past the last result page).
      """
      pid = 1
      while True:
          code, data = httpsGet('https://www.gelbeseiten.de/{0}/{1}/s{2}'.format(urllib.parse.quote(what), urllib.parse.quote(where), pid))
          if code != 200:
              break
          try:
              data = str(data, 'utf-8')
          except UnicodeDecodeError:
              # iso-8859-1 maps every byte, so this fallback cannot fail.
              data = str(data, 'iso-8859-1')
          # parse_page's patterns do not match across line breaks.
          data = data.replace('\r', '').replace('\n', '')
          page_addresses = parse_page(data)
          if not page_addresses:
              break
          yield page_addresses
          pid += 1
  # Command-line entry: <script> <was> <wo> prints the search results as
  # semicolon-separated, fully quoted CSV on standard output.
  if len(sys.argv) < 3:
      print('Syntax: {0} <was> <wo>'.format(sys.argv[0]))
  else:
      # Imported here because only this script section needs it.
      import csv

      what = sys.argv[1]
      where = sys.argv[2]
      # csv.writer keeps the ';' delimiter consistent between the header and
      # the rows and doubles any '"' that occurs inside a field value.
      writer = csv.writer(sys.stdout, delimiter=';', quoting=csv.QUOTE_ALL, lineterminator='\n')
      writer.writerow(['Name', 'Straße', 'PLZ', 'Stadt', 'Telefon', 'E-Mail'])
      for block in search(what, where):
          for address in block:
              writer.writerow([address.name, address.street, address.postcode, address.locality, address.phone, address.email])
Add Comment
Please, Sign In to add comment