Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
- from urllib.parse import urlparse, urljoin
- import time
- import multiprocessing.dummy
- from multiprocessing import Queue
# Browser-like request headers sent with every page fetch.
headers = {
    'User-Agent': ' '.join([
        'Mozilla/5.0 (Windows NT 6.1; WOW64)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/51.0.2704.103',
        'Safari/537.36',
    ]),
}

# Seed sites to crawl, one absolute URL per line; the order is preserved.
test_domains = """
http://redditgifts.com
https://about.reddit.com
http://www.msn.com
https://www.reddit.com
http://imgur.com
https://i.redd.it
https://youtu.be
https://soundcloud.com
http://i.imgur.com
https://reddit.zendesk.com
http://www.hollywoodreporter.com
https://play.google.com
https://gfycat.com
https://www.youtube.com
https://www.redditgifts.com
https://en.wikipedia.org
https://itunes.apple.com
https://twitter.com
http://www.bbc.co.uk
http://uk.businessinsider.com
https://i.imgur.com
http://forward.com
http://www.theyucatantimes.com
http://nbc4i.com
https://i.reddituploads.com
""".split()
def worker(domain, timeout=10):
    """Download one page and return the external domains it links to.

    Args:
        domain: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The set of domains that parse() extracts from the page.

    Raises:
        requests.exceptions.RequestException: propagated from
            requests.get() when the request fails or times out.
    """
    # Without a timeout requests waits indefinitely, so one unresponsive
    # host would block its pool thread and pool.map() would never return.
    r = requests.get(domain, headers=headers, timeout=timeout)
    response = BeautifulSoup(r.text, 'lxml')
    return parse(response, domain)
def _comparable_host(url):
    """Return the lowercase network location of *url* without a leading 'www.'."""
    # urlparse only fills in netloc when the string contains '//'.
    netloc = urlparse(url if '//' in url else '//' + url).netloc.lower()
    if netloc.startswith('www.'):
        netloc = netloc[len('www.'):]
    return netloc


def parse(response, domain):
    """Collect the domains of all links on a page that leave *domain*.

    Args:
        response: Parsed document exposing find_all('a'), whose tags
            expose get('href') (worker() passes a BeautifulSoup object).
        domain: URL the page was fetched from. Relative links are
            resolved against it, and links back to its host are dropped.

    Returns:
        A set of 'scheme://netloc' strings, one per distinct external
        domain linked from the page.
    """
    # Compare hosts rather than raw URL prefixes, so http/https and
    # www/non-www variants of the crawled site count as the same site.
    own_host = _comparable_host(domain)
    extracted_domains = set()
    for tag in response.find_all('a'):
        raw_href = tag.get('href')
        if not raw_href:
            continue
        try:
            href = urljoin(domain, raw_href)
            target = urlparse(href)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip this link
            # instead of aborting the whole page.
            continue
        # Links with no network location (mailto:, tel:, javascript:...)
        # do not point at a domain.
        if not target.netloc or href.startswith('javascript'):
            continue
        base_domain = _find_domain(href)
        if base_domain and _comparable_host(base_domain) != own_host:
            extracted_domains.add(base_domain)
    return extracted_domains
- def _find_domain(source_url):
- """
- Internal function called by harvest()
- Used to extract only the domain from a URL.
- """
- # urlparse will consider the netloc a fragment if // not in the string
- corrected_url = source_url if '//' in source_url else '//' + source_url
- parts = urlparse(corrected_url, scheme='http')
- domain = '{}://{}'.format(parts.scheme, parts.netloc)
- if domain == 'http://':
- return None
- return domain
if __name__ == '__main__':
    # Crawl every seed site concurrently, then print how many external
    # domains were found in total and how long the run took.
    start = time.time()
    # One thread per seed site; max() keeps the pool size valid even if
    # the seed list is empty.
    with multiprocessing.dummy.Pool(max(1, len(test_domains))) as pool:
        data = pool.map(worker, test_domains)
    # A plain list is enough to count the results. multiprocessing.Queue
    # was only being used as a counter, and its qsize() raises
    # NotImplementedError on macOS.
    found_urls = [url for data_set in data for url in data_set]
    print(len(found_urls))
    print(time.time() - start)
    # Sample output from one run:
    # 287
    # 6.161282300949097
Advertisement
Add Comment
Please, Sign In to add comment