Advertisement
Guest User

Untitled

a guest
Mar 30th, 2017
514
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.56 KB | None | 0 0
  1. import requests
  2. import re
  3. import bs4
  4.  
  5. def getpage(url):
  6.     """
  7.    Get an HTML of a page, using human-like User-Agent to avoid being detected as a bot.
  8.    returns html
  9.    """
  10.     headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
  11.     response = requests.get(url, headers=headers)
  12.     return response.content
  13.  
  14. def prendi_email(url):
  15.     html = getpage(url)
  16.     m = re.findall(r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])", html)
  17.     if m:
  18.         return m[0]
  19.  
  20. def lista_psicologi(url):
  21.     html = getpage(url)
  22.     lst = []
  23.     soup = bs4.BeautifulSoup(html, "lxml")
  24.     for link in soup.findAll('a'):
  25.         if '/psicologi-psicoterapeuti/' in link.get('href'):
  26.             lst.append(link.get('href'))
  27.     return list(set(lst))
  28.  
  29. if __name__ == '__main__':
  30.     # per idee di business, gestionali e/o servizi mirati a psicologi, mailami a giorgio@diguardia.org
  31.     lista = lista_psicologi("https://www.psicologionline.net/psicologi-psicoterapeuti/psicologo-campania/15?min_pager=0")
  32.     for tizio in lista:
  33.         print prendi_email("https://www.psicologionline.net" + str(tizio))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement