import requests
import urllib3
# Globally silence urllib3 warnings. In practice this suppresses the
# InsecureRequestWarning that would otherwise be emitted on every request,
# because CrawlConfig defaults to verify=False (TLS verification off).
urllib3.disable_warnings()
class CrawlConfig:
    """Container for the HTTP settings a Crawl instance uses.

    Attributes:
        proxy:   dict passed to requests as ``proxies`` (empty = no proxy).
        headers: dict of extra HTTP headers.
        cookies: dict of cookies to send.
        timeout: request timeout in seconds (default 30.0).
        verify:  whether to verify TLS certificates (default False; the
                 module disables urllib3 warnings accordingly).
    """

    def __init__(self, proxy=None, headers=None, cookies=None, timeout=30.0, verify=False):
        # The original signature used mutable defaults (proxy={}, ...),
        # which Python evaluates once: every default-constructed config
        # shared the same dict objects, so mutating one instance's headers
        # leaked into all others. None sentinels + per-instance dicts keep
        # the visible behavior ({} attributes) without the sharing bug.
        self.proxy = {} if proxy is None else proxy
        self.headers = {} if headers is None else headers
        self.timeout = timeout
        self.verify = verify
        self.cookies = {} if cookies is None else cookies
class Crawl:
    """Minimal HTTP fetcher: issues GET requests using a CrawlConfig."""

    def __init__(self, config: CrawlConfig):
        self.set_config(config)

    def set_config(self, config: CrawlConfig):
        """Copy the request settings off *config* onto this instance."""
        self.proxy = config.proxy
        self.headers = config.headers
        self.timeout = config.timeout
        self.verify = config.verify
        self.cookies = config.cookies

    def crawl(self, url):
        """GET *url* and return the response body decoded as text.

        Returns "" on any request or decode failure (best-effort contract
        preserved from the original implementation).
        """
        try:
            r = requests.get(
                url,
                headers=self.headers,
                cookies=self.cookies,
                timeout=self.timeout,
                verify=self.verify,
                proxies=self.proxy,
            )
            # NOTE(review): content.decode() assumes UTF-8 regardless of the
            # response charset; r.text would honor the declared encoding but
            # changes behavior, so it is left as-is.
            return r.content.decode()
        except (requests.RequestException, UnicodeDecodeError):
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid programming errors.
            return ""
from enum import Enum
from urllib.parse import urlparse
import re
class RE:
TAG = r"<(|[a-z]+)(| ([^<]+))>"
IMG = r"
]+)"
JS = r"