kimpeek

Threaded Scraper

Nov 6th, 2016
57
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.65 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from urllib.parse import urlparse, urljoin
  4. import time
  5. import multiprocessing.dummy
  6. from multiprocessing import Queue
  7.  
  8.  
  9.  
  10. headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
  11.                          'AppleWebKit/537.36 (KHTML, '
  12.                          'like Gecko) Chrome/51.0.2704.103 Safari/537.36'}
  13. test_domains = [
  14.     'http://redditgifts.com',
  15.     'https://about.reddit.com',
  16.     'http://www.msn.com',
  17.     'https://www.reddit.com',
  18.     'http://imgur.com',
  19.     'https://i.redd.it',
  20.     'https://youtu.be',
  21.     'https://soundcloud.com',
  22.     'http://i.imgur.com',
  23.     'https://reddit.zendesk.com',
  24.     'http://www.hollywoodreporter.com',
  25.     'https://play.google.com',
  26.     'https://gfycat.com',
  27.     'https://www.youtube.com',
  28.     'https://www.redditgifts.com',
  29.     'https://en.wikipedia.org',
  30.     'https://itunes.apple.com',
  31.     'https://twitter.com',
  32.     'http://www.bbc.co.uk',
  33.     'http://uk.businessinsider.com',
  34.     'https://i.imgur.com',
  35.     'http://forward.com',
  36.     'http://www.theyucatantimes.com',
  37.     'http://nbc4i.com',
  38.     'https://i.reddituploads.com',
  39. ]
  40.  
  41.  
  42. def worker(domain):
  43.     r = requests.get(domain, headers=headers)
  44.     response = BeautifulSoup(r.text, 'lxml')
  45.     return parse(response, domain)
  46.  
  47.  
  48. def parse(response, domain):
  49.     domain_netloc = domain.split('//:')[-1].strip('www.')
  50.     extracted_domains = set()
  51.     for tag in response.find_all('a'):
  52.         href = urljoin(domain, tag.get('href'))
  53.         if href.startswith('javascript'):
  54.             continue
  55.         base_domain = _find_domain(href)
  56.         if base_domain:
  57.             if not base_domain.split('//:')[-1].strip('www.').startswith(domain_netloc):
  58.                 extracted_domains.add(base_domain)
  59.     return extracted_domains
  60.  
  61.  
  62. def _find_domain(source_url):
  63.     """
  64.    Internal function called by harvest()
  65.    Used to extract only the domain from a URL.
  66.    """
  67.  
  68.     # urlparse will consider the netloc a fragment if // not in the string
  69.     corrected_url = source_url if '//' in source_url else '//' + source_url
  70.     parts = urlparse(corrected_url, scheme='http')
  71.     domain = '{}://{}'.format(parts.scheme, parts.netloc)
  72.     if domain == 'http://':
  73.         return None
  74.     return domain
  75.  
  76.  
  77. if __name__ == '__main__':
  78.     que = Queue()
  79.     start = time.time()
  80.     size = len(test_domains)
  81.     threads = []
  82.     pool = multiprocessing.dummy.Pool(size)
  83.     data = pool.map(worker, test_domains)
  84.     for data_set in data:
  85.         for url in data_set:
  86.             que.put(url)
  87.  
  88.     print(que.qsize())
  89.     print(time.time() - start)
  90.  
  91. """
  92. 287
  93. 6.161282300949097
  94. """
Advertisement
Add Comment
Please, Sign In to add comment