SHARE
TWEET

Untitled

a guest Mar 2nd, 2018 490 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
import re
from collections import deque
from urllib.parse import urljoin, urlsplit

import requests
import requests.exceptions

from bs4 import BeautifulSoup
  7.  
  8. # a queue of urls to be crawled
  9. new_urls = deque(['YOUR_URL_HERE'])
  10.  
  11. # a set of urls that we have already crawled
  12. processed_urls = set()
  13.  
  14. # a set of crawled emails
  15. emails = set()
  16.  
  17. # process urls one by one until we exhaust the queue
  18. while len(new_urls):
  19.  
  20.     # move next url from the queue to the set of processed urls
  21.     url = new_urls.popleft()
  22.     processed_urls.add(url)
  23.  
  24.     # extract base url to resolve relative links
  25.     parts = urlsplit(url)
  26.     base_url = "{0.scheme}://{0.netloc}".format(parts)
  27.     if parts.scheme !='mailto' and parts.scheme !='#':
  28.         path = url[:url.rfind('/')+1] if '/' in parts.path else url
  29.     else:
  30.         continue
  31.  
  32.     # get url's content
  33.     print("Processing %s" % url)
  34.     try:
  35.         response = requests.get(url)
  36.     except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.InvalidURL):
  37.         # ignore pages with errors
  38.         continue
  39.  
  40.     # extract all email addresses and add them into the resulting set
  41.     new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
  42.     emails.update(new_emails)
  43.  
  44.     # create a beutiful soup for the html document
  45.     soup = BeautifulSoup(response.text)
  46.  
  47.     # find and process all the anchors in the document
  48.     for anchor in soup.find_all("a"):
  49.         # extract link url from the anchor
  50.         link = anchor.attrs["href"] if "href" in anchor.attrs and anchor.attrs["href"].find("mailto") ==-1 and anchor.attrs["href"].find("tel") ==-1 and anchor.attrs["href"].find("#") ==-1  else ''
  51.         # resolve relative links
  52.         if link.startswith('/'):
  53.             link = base_url + link
  54.         elif not link.startswith('http'):
  55.             link = path + link
  56.         # add the new url to the queue if it was not enqueued nor processed yet
  57.         if not link in new_urls and not link in processed_urls and not link.find('WRITE_YOUR_DOMAIN') == -1:
  58.             new_urls.append(link)
  59. print(emails)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top