Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- import re
- import bs4
def getpage(url, timeout=30):
    """Download ``url`` and return the raw response body.

    The request is sent with a desktop-Chrome ``User-Agent`` header instead
    of the default ``python-requests`` one.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole script.

    Returns:
        The undecoded body of the response (``response.content``).

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content
# RFC-5322-style address pattern.  It only lists lowercase letters, so it is
# compiled with IGNORECASE; without the flag an address such as
# "John.Doe@Example.com" would be missed or matched only from its first
# lowercase character.  Compiled once because the function is called per page.
_EMAIL_RE = re.compile(
    r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])",
    re.IGNORECASE,
)


def prendi_email(url):
    """Return the first e-mail address found in the page at ``url``.

    The page is downloaded with ``getpage`` and searched from the top; the
    search stops at the first match instead of collecting every address.

    Args:
        url: Address of the page to scan.

    Returns:
        The first matching address as a string, or ``None`` when the page
        contains no match.
    """
    html = getpage(url)
    # On Python 3 a bytes payload cannot be searched with a str pattern.
    # On Python 2 ``bytes`` is ``str``, so this branch is skipped there.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode('utf-8', 'replace')
    match = _EMAIL_RE.search(html)
    if match:
        return match.group(0)
    return None
def lista_psicologi(url):
    """Collect the profile links found on the listing page at ``url``.

    The page is downloaded with ``getpage`` and parsed with BeautifulSoup
    (``lxml`` parser).  Every ``<a>`` whose ``href`` contains
    ``/psicologi-psicoterapeuti/`` is kept.

    Args:
        url: Address of the listing page.

    Returns:
        A list of unique ``href`` values, in the order they first appear in
        the document.
    """
    html = getpage(url)
    soup = bs4.BeautifulSoup(html, "lxml")
    seen = set()
    hrefs = []
    # href=True skips anchors that have no href attribute; for those
    # link.get('href') is None and the ``in`` test would raise TypeError.
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/psicologi-psicoterapeuti/' in href and href not in seen:
            seen.add(href)
            hrefs.append(href)
    return hrefs
if __name__ == '__main__':
    # For business ideas, management software and/or services aimed at
    # psychologists, email me at giorgio@diguardia.org
    lista = lista_psicologi("https://www.psicologionline.net/psicologi-psicoterapeuti/psicologo-campania/15?min_pager=0")
    # Each collected href is appended to the site root and the first e-mail
    # address found on that page is printed (``None`` when there is none).
    for tizio in lista:
        # Single-argument call form: same output as the print statement on
        # Python 2, and valid syntax on Python 3 as well.
        print(prendi_email("https://www.psicologionline.net" + str(tizio)))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement