Advertisement
Guest User

Untitled

a guest
Mar 2nd, 2018
801
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.12 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. import requests
  3. import requests.exceptions
  4. from urllib.parse import urlsplit
  5. from collections import deque
  6. import re
  7.  
  8. # a queue of urls to be crawled
  9. new_urls = deque(['YOUR_URL_HERE'])
  10.  
  11. # a set of urls that we have already crawled
  12. processed_urls = set()
  13.  
  14. # a set of crawled emails
  15. emails = set()
  16.  
  17. # process urls one by one until we exhaust the queue
  18. while len(new_urls):
  19.  
  20.     # move next url from the queue to the set of processed urls
  21.     url = new_urls.popleft()
  22.     processed_urls.add(url)
  23.  
  24.     # extract base url to resolve relative links
  25.     parts = urlsplit(url)
  26.     base_url = "{0.scheme}://{0.netloc}".format(parts)
  27.     if parts.scheme !='mailto' and parts.scheme !='#':
  28.         path = url[:url.rfind('/')+1] if '/' in parts.path else url
  29.     else:
  30.         continue
  31.  
  32.     # get url's content
  33.     print("Processing %s" % url)
  34.     try:
  35.         response = requests.get(url)
  36.     except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.InvalidURL):
  37.         # ignore pages with errors
  38.         continue
  39.  
  40.     # extract all email addresses and add them into the resulting set
  41.     new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
  42.     emails.update(new_emails)
  43.  
  44.     # create a beutiful soup for the html document
  45.     soup = BeautifulSoup(response.text)
  46.  
  47.     # find and process all the anchors in the document
  48.     for anchor in soup.find_all("a"):
  49.         # extract link url from the anchor
  50.         link = anchor.attrs["href"] if "href" in anchor.attrs and anchor.attrs["href"].find("mailto") ==-1 and anchor.attrs["href"].find("tel") ==-1 and anchor.attrs["href"].find("#") ==-1  else ''
  51.         # resolve relative links
  52.         if link.startswith('/'):
  53.             link = base_url + link
  54.         elif not link.startswith('http'):
  55.             link = path + link
  56.         # add the new url to the queue if it was not enqueued nor processed yet
  57.         if not link in new_urls and not link in processed_urls and not link.find('WRITE_YOUR_DOMAIN') == -1:
  58.             new_urls.append(link)
  59. print(emails)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement