Advertisement
BeMike

Parser

Dec 5th, 2016
60
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.46 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. import requests
  3. from lxml import html, etree
  4. import socks
  5. import csv
  6. import requests.exceptions
  7. import re
  8.  
  9. hrefs2 = []
  10. hrefs = []
  11. data = []
  12.  
  13. proxies = [
  14. 'socks5://46.148.112.222:1085',
  15. 'socks5://95.85.80.213:1085',
  16. 'socks5://185.101.71.76:1085',
  17. 'socks5://5.189.201.218:1085',
  18. 'socks5://141.101.201.151:1085',
  19. 'socks5://146.185.205.15:1085',
  20. 'socks5://193.106.31.11:1085',
  21. 'socks5://95.181.217.149:1085',
  22. 'socks5://5.62.152.73:1085'
  23. ]
  24.  
  25. for prox in proxies:
  26. while True:
  27. base_url = "https://www.bundes-telefonbuch.de/suche/b"
  28. url1 = 'https://www.bundes-telefonbuch.de/'
  29. session = requests.Session()
  30. try:
  31. response = session.get(base_url,proxies = dict(http = prox))
  32. except requests.exceptions.ConnectionError:
  33. continue
  34.  
  35. parsed_body = html.fromstring(response.text)
  36.  
  37. elements = parsed_body.find_class('listitempage')
  38. for element in elements:
  39. for href in element.findall('.//a'):
  40. x = href.get('href')
  41. hrefs.append(x)
  42.  
  43. for url2 in hrefs:
  44. pages_urls = url1 + str(url2)
  45. response2 = session.get(pages_urls, proxies = dict(http = prox))
  46. body = html.fromstring(response2.content.decode('utf-8'))
  47.  
  48. name = body.find_class('panel-title')
  49. for d in name:
  50. for hr in d.findall('.//a'):
  51. a = hr.get('href')
  52. hrefs2.append(a)
  53.  
  54. for url3 in hrefs2:
  55. pages_urls2 = url1 + str(url3)
  56. if len(pages_urls2)>55:
  57. response3 = session.get(pages_urls2, proxies = dict(http = prox))
  58. body2 = html.fromstring(response3.content.decode('utf-8'))
  59.  
  60. elname = body2.find_class('col-xs-12')
  61. for eln in elname:
  62. for hre in eln.findall('.//meta'):
  63. n = hre.get('content')
  64. print n
  65.  
  66.  
  67. elmail = body2.find_class('table')
  68. for e in elmail:
  69. k = list(e.itertext())
  70. for j in k:
  71. mail = re.findall(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", j)
  72. if mail:
  73. mail = mail
  74. print ';',mail
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement