skip420

Web_Email_Scraper

Dec 27th, 2020
Python
import re
import requests
import requests.exceptions
from urllib.parse import urlsplit
from collections import deque
from bs4 import BeautifulSoup

# starting url. replace with your own url.
starting_url = 'http://www.miet.ac.in'

# a queue of urls still to be crawled
unprocessed_urls = deque([starting_url])

# set of urls that have already been crawled for emails
processed_urls = set()

# set of fetched email addresses
emails = set()

# process urls one by one from the unprocessed_urls queue until it is empty
while len(unprocessed_urls):

    # move the next url from the queue to the set of processed urls
    url = unprocessed_urls.popleft()
    processed_urls.add(url)

    # extract the base url so relative links can be resolved
    parts = urlsplit(url)
    base_url = "{0.scheme}://{0.netloc}".format(parts)
    path = url[:url.rfind('/') + 1] if '/' in parts.path else url

    # get the url's content
    print("Crawling URL %s" % url)
    try:
        response = requests.get(url)
    except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
        # ignore pages with errors and continue with the next url
        continue

    # extract all email addresses and add them to the resulting set
    # (edit the regular expression as per your requirement)
    new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
    emails.update(new_emails)
    print(emails)

    # parse the html document with BeautifulSoup
    soup = BeautifulSoup(response.text, 'lxml')

    # once the document is parsed, find and process all anchors, i.e. linked urls, in it
    for anchor in soup.find_all("a"):
        # extract the link url from the anchor
        link = anchor.attrs["href"] if "href" in anchor.attrs else ''
        # resolve relative links (starting with /)
        if link.startswith('/'):
            link = base_url + link
        elif not link.startswith('http'):
            link = path + link
        # add the new url to the queue if it is in neither the unprocessed nor the processed set
        if link not in unprocessed_urls and link not in processed_urls:
            unprocessed_urls.append(link)