#-*-coding:utf8;-*-
#qpy:3
#qpy:console
from html.parser import HTMLParser as _HTMLParser
from urllib.request import urlopen, Request
from urllib.error import HTTPError
import urllib.robotparser
from urllib.parse import urlparse, urljoin
import cProfile
import mimetypes
import time
import math
import random

mimetypes.init()

PROFILE = cProfile.Profile()
ENABLE_PROFILING = False
if ENABLE_PROFILING:
    PROFILE.enable()

DEFAULT_ENCODING = 'latin-1'
PARENT = '../'
def profile(func):
    # Collect cProfile data for every call to the wrapped function;
    # try/finally guarantees the profiler is switched off again even
    # though the wrapper returns from inside the block.
    def wrap(*args, **kw):
        PROFILE.enable()
        try:
            return func(*args, **kw)
        finally:
            PROFILE.disable()
    return wrap
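# A minimal usage sketch for the decorator above (not part of the original
# paste; slow_fetch is a hypothetical name):
#
#     @profile
#     def slow_fetch(url):
#         return CrawlerClient().get(url)
#
#     slow_fetch('http://example.com/')
#     PROFILE.print_stats(sort='cumulative')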
def average(lst):
    return float(sum(lst) / len(lst))

def random_wait(mini, maxi):
    # Sleep a random whole number of seconds between mini and maxi.
    time.sleep(random.randint(mini, maxi))
USER_AGENTS = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
]

def random_ua():
    return random.choice(USER_AGENTS)
class CrawlerClient(object):

    def __init__(self, **kw):
        self.robotparser = urllib.robotparser.RobotFileParser()
        self.ua = random_ua()
        self.referer = kw.get('referer', 'www.google.com')
        self.not_found = set()

    def can_fetch(self, url):
        return self.robotparser.can_fetch(self.ua, url)

    def get(self, url):
        self.ua = random_ua()
        req = Request(url)
        req.add_header('User-Agent', self.ua)
        req.add_header('Connection', 'keep-alive')
        req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml')
        req.add_header('Referer', self.referer)
        # Fetch the site's robots.txt and honour it before requesting
        # the page itself.
        parsed_url = urlparse(url)
        robot_file_path = parsed_url.scheme + '://' + parsed_url.netloc + '/robots.txt'
        self.robotparser.set_url(robot_file_path)
        self.robotparser.read()
        if not self.can_fetch(url):
            return
        try:
            with urlopen(req) as res:
                http_headers = res.headers
                content_type, *charset = http_headers.get('content-type', '').split(';')
                # Try to read the charset from the Content-Type header.
                if charset:
                    charset = charset[0].strip().split('=')[1]
                # Otherwise fall back to a permissive default encoding.
                else:
                    charset = DEFAULT_ENCODING
                # Only feed text/* content to the parser.
                if content_type.split('/')[0] == 'text':
                    return res.read().decode(charset)
        except HTTPError as e:
            # urlopen raises HTTPError for 4xx/5xx responses, so dead
            # links (404) are recorded here.
            if e.code == 404:
                self.not_found.add(url)
        except Exception:
            # Any other network or decoding error: skip the page.
            pass
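# A minimal usage sketch for CrawlerClient (the URL is an assumption, not
# part of the original paste):
#
#     client = CrawlerClient(referer='www.google.com')
#     html = client.get('http://example.com/')
#     if html:
#         print(html[:80])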
class HTMLParser(_HTMLParser):

    def __init__(self, url, strict=False):
        self.url = urlparse(url)
        self.size = 0
        self.client = CrawlerClient(referer='https://rickys-python-notes.blogspot.com')
        # If strict is True the parser will break on broken html.
        # Otherwise it will ignore broken html and keep on parsing.
        if not strict:
            self.error = self._do_nothing
        _HTMLParser.__init__(self)
        # links holds all the links the parser finds. The parser looks
        # for links in anchor and link tags.
        self.links = set()
        self.base_url = '{}://{}'.format(self.url.scheme, self.url.netloc)
        # title will hold the value of the page's title, if the page
        # has one.
        self.title = None
        # This flag lets handle_data know that we are currently reading
        # the title data so it can store it in self.title.
        self.recording_title = False
        html = self.client.get(url)
        if html:
            self.feed(html)
    def handle_starttag(self, tag, attrs):
        # Scan every attribute, not just the first, so href is found
        # wherever it appears in the tag.
        for key, val in attrs:
            if key == 'href' and val:
                # Protocol-relative links inherit the page's scheme.
                if val.startswith('//'):
                    val = self.url.scheme + ':' + val
                url = urlparse(val)
                # Resolve relative links against the page's base URL.
                if not url.netloc:
                    url = urlparse(urljoin(self.base_url, url.path))
                self.links.add(url.geturl())
        if tag == 'title':
            self.recording_title = True

    def handle_endtag(self, tag):
        if tag == 'title':
            self.recording_title = False

    def handle_data(self, data):
        if self.recording_title:
            self.title = data.strip()

    def _do_nothing(self, *_, **__):
        return
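# A minimal usage sketch for the parser (the URL is an assumption, not part
# of the original paste):
#
#     page = HTMLParser('http://example.com/')
#     print(page.title)
#     for link in sorted(page.links):
#         print(link)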
class CrawlerQueue(object):

    def __init__(self, seed, **kw):
        self.seed = seed
        self.tocrawl = [seed]
        self.crawled = list()
        self.non_html_links = list()
        self.domain = urlparse(seed).netloc
        self.same_domain = kw.get('same_domain', True)
        self.exclude_parent_links = kw.get('exclude_parent_links', True)

    def next(self):
        # Shuffle so pages are crawled in random order.
        random.shuffle(self.tocrawl)
        link = self.tocrawl.pop()
        self.crawled.append(link)
        return link

    def is_same_domain(self, link):
        return urlparse(link).netloc == self.domain

    def add_link(self, link):
        # Record links that do not look like HTML and skip them.
        guessed_type = mimetypes.guess_type(link)[0] or 'text/html'
        if guessed_type != 'text/html':
            self.non_html_links.append(link)
            return
        if link in self.crawled or link in self.tocrawl:
            return
        if self.exclude_parent_links and PARENT in link:
            return
        if self.same_domain and not self.is_same_domain(link):
            return
        self.tocrawl.append(link)

    def add_links(self, links):
        for link in links:
            self.add_link(link)

    @property
    def total_crawled(self):
        return len(self.crawled)

    @property
    def in_queue(self):
        return len(self.tocrawl)

    @property
    def total_non_html_links(self):
        return len(self.non_html_links)

    @property
    def has_links(self):
        return bool(self.tocrawl)

    @property
    def empty(self):
        return not self.has_links
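# Design note: next() shuffles the queue before popping, so pages are
# visited in random order rather than breadth- or depth-first. A FIFO
# alternative (a sketch under assumptions, not the author's code) would
# swap the list for collections.deque:
#
#     from collections import deque
#     self.tocrawl = deque([seed])    # in __init__
#     link = self.tocrawl.popleft()   # in next()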
q = CrawlerQueue('http://reddit.com', same_domain=False)
not_found = set()

while q.has_links:
    crawling = q.next()
    page = HTMLParser(crawling)
    not_found.update(page.client.not_found)
    q.add_links(page.links)
    title = page.title
    if title:
        print(title, not_found)
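# random_wait is defined above but never called; a polite crawl would
# sleep between requests. A minimal sketch (the 1-5 second bounds are an
# assumption), added as the first line of the while loop:
#
#     random_wait(1, 5)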