Crawl

import requests
import urllib3
urllib3.disable_warnings()

class CrawlConfig:
    def __init__(self, proxy={}, headers={}, cookies={}, timeout=30.0, verify=False):
        self.proxy = proxy
        self.headers = headers
        self.timeout = timeout
        self.verify = verify
        self.cookies = cookies


class Crawl:
    def __init__(self, config: CrawlConfig):
        self.set_config(config)

    def set_config(self, config: CrawlConfig):
        self.proxy = config.proxy
        self.headers = config.headers
        self.timeout = config.timeout
        self.verify = config.verify
        self.cookies = config.cookies

    def crawl(self, url):
        try:
            r = requests.get(url, headers=self.headers, cookies=self.cookies, timeout=self.timeout, verify=self.verify,
                                    proxies=self.proxy)
            return r.content.decode()
        except:
            return ""


from enum import Enum
from urllib.parse import urlparse
import re

class RE:
    TAG = r"<(|[a-z]+)(| ([^<]+))>"
    IMG = r"<img (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    JS = r"<script (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    CSS = r"<link (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"
    LINK = r"<a (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"

    EMAIL = r"([\w\.,]+@[\w\.,]+\.\w+)"

class ParseHTML:
    def __init__(self, base_url: str, html: str):
        self.html = html
        self.base_url = base_url
        if "https" in self.base_url:
            self.protocol = "https"
        else:
            self.protocol = "http"

    def __get_link(self, link: str) -> str:
        link = link.split("#")[0]
        if link == "" or link[0: 10] == "javascript" or link[0:3]=="tel" or link[0:6] == "mailto":
            return self.base_url

        if link[0:2] == "//":
            return self.protocol + ":" + link
        else:
            if link[0:1] == "/":
                return self.base_url + link
        return link


    def __get_vector(self):
        matches = re.finditer(RE.TAG, self.html, re.MULTILINE)
        vector = {}
        for tag in [e.groups()[0] for e in matches]:
            if tag not in vector:
                vector[tag] = 1
                continue
            vector[tag] += 1

        return vector

    def __get_resource(self):
        return {
            "js": list(set([urlparse(e.groups()[1]).netloc for e in re.finditer(RE.JS, self.html)])),
            "img": list(set([urlparse(e.groups()[1]).netloc for e in re.finditer(RE.IMG, self.html)])),
            "css": list(set([urlparse(e.groups()[1]).netloc for e in re.finditer(RE.CSS, self.html)])),
            "link": list(set([self.__get_link(e.groups()[1]) for e in re.finditer(RE.LINK, self.html)]))
        }

    def parse(self):
        return {
            "resource": self.__get_resource(),
            "vector": self.__get_vector()
        }


if __name__ == "__main__":
    from multiprocessing import Pool
    from datetime import datetime
    pool = Pool(processes=8)
    c = Crawl(CrawlConfig())

    checked_urls = set([])
    current_urls = ["http://www.bbcamerica.com"]
    print (datetime.now())
    while len(list(checked_urls)) < 10000:
        if len(current_urls) + len(list(checked_urls)) > 10000:
            current_urls = current_urls[0: 10000 - len(list(checked_urls))]
        checked_urls.update(set(current_urls))
        contents = pool.map(c.crawl, current_urls)
        current_urls = []
        for html in contents:
            parse = ParseHTML(base_url="http://www.bbcamerica.com", html=html)
            obj = parse.parse()
            current_urls += obj['resource']['link']

        current_urls = [url for url in current_urls if url not in checked_urls]
        print ("Current len: ", len(list(checked_urls)))


    print ("Len: ", len(list(checked_urls)))

    arr = list(checked_urls)
    arr.sort()

    import json
    json.dump(arr, open("urls.json", "w"))
    print (datetime.now())