#-*-coding:utf8;-*-
#qpy:3
#qpy:console
from html.parser import HTMLParser as _HTMLParser
from urllib.request import urlopen, Request
from urllib.error import HTTPError
import urllib.robotparser
from urllib.parse import urlparse, urljoin
import cProfile
import mimetypes
import time
import math
import random

mimetypes.init()

PROFILE = cProfile.Profile()
ENABLE_PROFILING = False
if ENABLE_PROFILING:
    PROFILE.enable()

DEFAULT_ENCODING = 'latin-1'
PARENT = '../'
def profile(func):
    # Collect cProfile data for every call to the wrapped function;
    # try/finally guarantees the profiler is switched off again even
    # though the wrapper returns from inside the block.
    def wrap(*args, **kw):
        PROFILE.enable()
        try:
            return func(*args, **kw)
        finally:
            PROFILE.disable()
    return wrap
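# A minimal usage sketch for the decorator above (not part of the original
# paste; slow_fetch is a hypothetical name):
#
#     @profile
#     def slow_fetch(url):
#         return CrawlerClient().get(url)
#
#     slow_fetch('http://example.com/')
#     PROFILE.print_stats(sort='cumulative')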
def average(lst):
    return float(sum(lst) / len(lst))

def random_wait(mini, maxi):
    # Sleep a random whole number of seconds between mini and maxi.
    time.sleep(random.randint(mini, maxi))
USER_AGENTS = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
]

def random_ua():
    return random.choice(USER_AGENTS)
class CrawlerClient(object):

    def __init__(self, **kw):
        self.robotparser = urllib.robotparser.RobotFileParser()
        self.ua = random_ua()
        self.referer = kw.get('referer', 'www.google.com')
        self.not_found = set()

    def can_fetch(self, url):
        return self.robotparser.can_fetch(self.ua, url)

    def get(self, url):
        self.ua = random_ua()
        req = Request(url)
        req.add_header('User-Agent', self.ua)
        req.add_header('Connection', 'keep-alive')
        req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml')
        req.add_header('Referer', self.referer)
        # Fetch the site's robots.txt and honour it before requesting
        # the page itself.
        parsed_url = urlparse(url)
        robot_file_path = parsed_url.scheme + '://' + parsed_url.netloc + '/robots.txt'
        self.robotparser.set_url(robot_file_path)
        self.robotparser.read()
        if not self.can_fetch(url):
            return
        try:
            with urlopen(req) as res:
                http_headers = res.headers
                content_type, *charset = http_headers.get('content-type', '').split(';')
                # Try to read the charset from the Content-Type header.
                if charset:
                    charset = charset[0].strip().split('=')[1]
                # Otherwise fall back to a permissive default encoding.
                else:
                    charset = DEFAULT_ENCODING
                # Only feed text/* content to the parser.
                if content_type.split('/')[0] == 'text':
                    return res.read().decode(charset)
        except HTTPError as e:
            # urlopen raises HTTPError for 4xx/5xx responses, so dead
            # links (404) are recorded here.
            if e.code == 404:
                self.not_found.add(url)
        except Exception:
            # Any other network or decoding error: skip the page.
            pass
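# A minimal usage sketch for CrawlerClient (the URL is an assumption, not
# part of the original paste):
#
#     client = CrawlerClient(referer='www.google.com')
#     html = client.get('http://example.com/')
#     if html:
#         print(html[:80])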
class HTMLParser(_HTMLParser):

    def __init__(self, url, strict=False):
        self.url = urlparse(url)
        self.size = 0
        self.client = CrawlerClient(referer='https://rickys-python-notes.blogspot.com')
        # If strict is True the parser will break on broken html.
        # Otherwise it will ignore broken html and keep on parsing.
        if not strict:
            self.error = self._do_nothing
        _HTMLParser.__init__(self)
        # links holds all the links the parser finds. The parser looks
        # for links in anchor and link tags.
        self.links = set()
        self.base_url = '{}://{}'.format(self.url.scheme, self.url.netloc)
        # title will hold the value of the page's title, if the page
        # has one.
        self.title = None
        # This flag lets handle_data know that we are currently reading
        # the title data so it can store it in self.title.
        self.recording_title = False
        html = self.client.get(url)
        if html:
            self.feed(html)
    def handle_starttag(self, tag, attrs):
        # Scan every attribute, not just the first, so href is found
        # wherever it appears in the tag.
        for key, val in attrs:
            if key == 'href' and val:
                # Protocol-relative links inherit the page's scheme.
                if val.startswith('//'):
                    val = self.url.scheme + ':' + val
                url = urlparse(val)
                # Resolve relative links against the page's base URL.
                if not url.netloc:
                    url = urlparse(urljoin(self.base_url, url.path))
                self.links.add(url.geturl())
        if tag == 'title':
            self.recording_title = True

    def handle_endtag(self, tag):
        if tag == 'title':
            self.recording_title = False

    def handle_data(self, data):
        if self.recording_title:
            self.title = data.strip()

    def _do_nothing(self, *_, **__):
        return
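# A minimal usage sketch for the parser (the URL is an assumption, not part
# of the original paste):
#
#     page = HTMLParser('http://example.com/')
#     print(page.title)
#     for link in sorted(page.links):
#         print(link)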
class CrawlerQueue(object):

    def __init__(self, seed, **kw):
        self.seed = seed
        self.tocrawl = [seed]
        self.crawled = list()
        self.non_html_links = list()
        self.domain = urlparse(seed).netloc
        self.same_domain = kw.get('same_domain', True)
        self.exclude_parent_links = kw.get('exclude_parent_links', True)

    def next(self):
        # Shuffle so pages are crawled in random order.
        random.shuffle(self.tocrawl)
        link = self.tocrawl.pop()
        self.crawled.append(link)
        return link

    def is_same_domain(self, link):
        return urlparse(link).netloc == self.domain

    def add_link(self, link):
        # Record links that do not look like HTML and skip them.
        guessed_type = mimetypes.guess_type(link)[0] or 'text/html'
        if guessed_type != 'text/html':
            self.non_html_links.append(link)
            return
        if link in self.crawled or link in self.tocrawl:
            return
        if self.exclude_parent_links and PARENT in link:
            return
        if self.same_domain and not self.is_same_domain(link):
            return
        self.tocrawl.append(link)

    def add_links(self, links):
        for link in links:
            self.add_link(link)

    @property
    def total_crawled(self):
        return len(self.crawled)

    @property
    def in_queue(self):
        return len(self.tocrawl)

    @property
    def total_non_html_links(self):
        return len(self.non_html_links)

    @property
    def has_links(self):
        return bool(self.tocrawl)

    @property
    def empty(self):
        return not self.has_links
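# Design note: next() shuffles the queue before popping, so pages are
# visited in random order rather than breadth- or depth-first. A FIFO
# alternative (a sketch under assumptions, not the author's code) would
# swap the list for collections.deque:
#
#     from collections import deque
#     self.tocrawl = deque([seed])    # in __init__
#     link = self.tocrawl.popleft()   # in next()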
q = CrawlerQueue('http://reddit.com', same_domain=False)
not_found = set()

while q.has_links:
    crawling = q.next()
    page = HTMLParser(crawling)
    not_found.update(page.client.not_found)
    q.add_links(page.links)
    title = page.title
    if title:
        print(title, not_found)
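# random_wait is defined above but never called; a polite crawl would
# sleep between requests. A minimal sketch (the 1-5 second bounds are an
# assumption), added as the first line of the while loop:
#
#     random_wait(1, 5)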