Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -*- coding: utf-8 -*-
import csv
import re
import sys

import requests
import requests.exceptions
import socks
from lxml import html, etree
# Scraper for www.bundes-telefonbuch.de: starting from one search page it
# follows the listing links, then the detail links found on those listings,
# and prints the <meta content="..."> values and e-mail addresses found on
# each detail page.
#
# NOTE(review): the pasted original had lost its indentation, so the loop
# nesting below is a reconstruction -- confirm it matches the intended flow.

# Module-level lists the original script defined.  They are kept so the names
# still exist, but the crawl now works on local lists so that nothing
# accumulates (and gets re-fetched) across proxy attempts.
hrefs2 = []
hrefs = []
data = []

# SOCKS5 proxies, tried in order until one of them completes a crawl.
# NOTE(review): requests needs its optional SOCKS support (PySocks) installed
# to use socks5:// proxy URLs -- confirm it is available in the environment.
proxies = [
    'socks5://46.148.112.222:1085',
    'socks5://95.85.80.213:1085',
    'socks5://185.101.71.76:1085',
    'socks5://5.189.201.218:1085',
    'socks5://141.101.201.151:1085',
    'socks5://146.185.205.15:1085',
    'socks5://193.106.31.11:1085',
    'socks5://95.181.217.149:1085',
    'socks5://5.62.152.73:1085',
]

# Search page the crawl starts from, and the prefix every scraped href is
# appended to.
base_url = "https://www.bundes-telefonbuch.de/suche/b"
url1 = 'https://www.bundes-telefonbuch.de/'

# Only detail URLs strictly longer than this are fetched.  The value comes
# from the original script.
# NOTE(review): presumably this filters out short navigation links -- confirm.
detail_url_min_length = 55

# Seconds to wait on a request before the proxy is treated as failed; without
# a timeout a stalled proxy would block the script indefinitely.
request_timeout = 30

# Matches a text node that consists of nothing but one e-mail address.
# Compiled once here instead of on every text node.
_EMAIL_RE = re.compile(
    r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")


def proxy_mapping(proxy_url):
    """Return a requests ``proxies`` dict that routes through *proxy_url*.

    Both schemes are mapped: the original mapped only ``http``, so the
    ``https://`` URLs this script requests never went through the proxy.
    """
    return {'http': proxy_url, 'https': proxy_url}


def unique(items):
    """Return *items* as a list without duplicates, first occurrence first."""
    seen = set()
    ordered = []
    for item in items:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered


def find_emails(text):
    """Return ``[address]`` if *text* is a single e-mail address, else ``[]``.

    Surrounding whitespace is stripped first; the pattern is anchored at both
    ends, so an address embedded in other text does not match.
    """
    return _EMAIL_RE.findall(text.strip())


def collect_links(document, css_class):
    """Return the ``href`` of every ``<a>`` below elements with *css_class*.

    Anchors without an ``href`` attribute are skipped so they do not turn
    into a bogus ``.../None`` URL later on.
    """
    links = []
    for container in document.find_class(css_class):
        for anchor in container.findall('.//a'):
            target = anchor.get('href')
            if target is not None:
                links.append(target)
    return links


def fetch(session, url, proxy_map):
    """GET *url* on *session* through *proxy_map* and return the response."""
    return session.get(url, proxies=proxy_map, timeout=request_timeout)


def report_entry(document):
    """Print the meta contents and e-mail addresses of one detail page."""
    for container in document.find_class('col-xs-12'):
        for meta in container.findall('.//meta'):
            print(meta.get('content'))
    for table in document.find_class('table'):
        for text in table.itertext():
            found = find_emails(text)
            if found:
                # Same "; [address]" output format as the original script.
                print("; %s" % (found,))


def crawl(session, proxy_url, visited):
    """Run one full crawl through *proxy_url*, printing every detail page.

    *visited* is a set of detail URLs that have already been printed; it is
    updated in place so a crawl restarted on another proxy does not print
    the same entry twice.

    Raises whatever ``requests`` raises when the proxy cannot be used.
    """
    proxy_map = proxy_mapping(proxy_url)

    start_page = html.fromstring(fetch(session, base_url, proxy_map).text)
    listing_links = unique(collect_links(start_page, 'listitempage'))

    detail_links = []
    for link in listing_links:
        response = fetch(session, url1 + link, proxy_map)
        listing = html.fromstring(response.content.decode('utf-8'))
        detail_links.extend(collect_links(listing, 'panel-title'))

    for link in unique(detail_links):
        detail_url = url1 + link
        if len(detail_url) <= detail_url_min_length or detail_url in visited:
            continue
        response = fetch(session, detail_url, proxy_map)
        report_entry(html.fromstring(response.content.decode('utf-8')))
        visited.add(detail_url)


def main():
    """Crawl through the first proxy that works; return a process exit code.

    A proxy that fails to connect or times out is reported on stderr and the
    next one is tried, instead of retrying the same dead proxy forever.
    Returns 0 after a complete crawl and 1 if every proxy failed.
    """
    visited = set()
    for proxy_url in proxies:
        session = requests.Session()
        try:
            crawl(session, proxy_url, visited)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout) as exc:
            sys.stderr.write("proxy %s failed: %s\n" % (proxy_url, exc))
        else:
            return 0
        finally:
            session.close()
    sys.stderr.write("no proxy completed the crawl\n")
    return 1


if __name__ == "__main__":
    sys.exit(main())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement