Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import urllib.request
- from urllib.parse import urlsplit, urlunsplit, urljoin, urlparse
- import re
- from multiprocessing.pool import ThreadPool
class Crawler:
    """Recursive crawler that collects every internal link of a site.

    Usage: ``Crawler('http://example.com/').run()`` returns the list of
    internal URLs discovered starting from the given page.
    """

    # Stop fetching new pages once this many links have been collected.
    MAX_LINKS = 50000

    def __init__(self, url, silence=False):
        """
        :param url: start URL; its netloc defines the crawled domain.
        :param silence: when True, suppress per-page progress output.
        """
        self.url = self.normalize(url)
        self.host = urlparse(self.url).netloc
        self.silence = silence
        self.founded_links = []          # all internal links discovered so far
        self.visited_links = [self.url]  # URLs already scheduled for crawling

    def _crawl(self, url):
        """Fetch *url*, collect its internal links and recurse into them.

        NOTE(review): ``self.founded_links`` and ``self.visited_links`` are
        mutated from ThreadPool workers without locking; under contention
        duplicates may slip through the ``not in`` checks.
        """
        # Honour the global limit BEFORE doing more network work.  (The
        # original checked the hard-coded literal 50000 only after the
        # pool.map call had already finished, so MAX_LINKS never applied.)
        if len(self.founded_links) > self.MAX_LINKS:
            return
        if not self.silence:
            print(f"{len(self.founded_links)} - Парсинг {url}")
        try:
            response = urllib.request.urlopen(url)
        # Boundary handler: log the failed URL and keep crawling.  A bare
        # `except:` here would also have swallowed KeyboardInterrupt.
        except Exception:
            print(f"Ошибка {url}")
            return
        # Keep only well-formed links belonging to the crawled domain.
        # Links containing 'infinite' are skipped (calendar-style crawler
        # traps) — presumably site-specific; verify against the target site.
        links = []
        for link in self.find_links(response):
            if 'infinite' in link:
                continue
            if Crawler.is_url(link) and self.is_internal(link):
                if link not in links:
                    links.append(link)
                if link not in self.founded_links:
                    self.founded_links.append(link)
        # Explore the discovered URLs concurrently; always release the pool
        # (the original leaked it on the normal path).
        scrap_pool = ThreadPool(15)
        try:
            scrap_pool.map(self.link_analyze, links)
        finally:
            scrap_pool.close()
            scrap_pool.join()

    def run(self):
        """Start the crawl and return the list of found internal links."""
        self._crawl(self.url)
        return self.founded_links

    def is_internal(self, url):
        """Return True when *url* belongs to the crawled domain.

        Relative URLs (empty netloc) count as internal.
        """
        host = urlparse(url).netloc
        return host in ('', self.host)

    def link_analyze(self, link):
        """Crawl *link* unless it has already been visited."""
        if link not in self.visited_links:
            link = self.normalize(link)
            self.visited_links.append(link)
            # Resolve relative links against the start URL before fetching.
            self._crawl(urljoin(self.url, link))

    @staticmethod
    def find_links(response):
        """Extract the href targets of all ``<a>`` tags on the page.

        :param response: object whose ``read()`` returns the page as bytes.
        :return: list of href values in document order (may repeat).
        """
        # Decode instead of str(bytes): str() yields the bytes repr
        # (b'...' prefix and escaped characters) and mangles the markup.
        page = response.read().decode('utf-8', errors='replace')
        # The original class [\'|"] made '|' a literal quote character —
        # a quote is either ' or ".
        pattern = r'<a [^>]*href=[\'"](.*?)[\'"].*?>'
        return re.findall(pattern, page)

    @staticmethod
    def is_url(url):
        """Return True for a non-empty URL with an http(s) or relative (empty) scheme."""
        scheme = urlsplit(url).scheme
        return url != '' and scheme in ('http', 'https', '')

    @staticmethod
    def normalize(url):
        """Return a canonical form of *url* (split into parts and re-joined)."""
        return urlunsplit(urlsplit(url))
if __name__ == '__main__':
    # Crawl the site and report how many internal links were found.
    crawler = Crawler('http://yandex.ru/')
    found = crawler.run()
    print(len(found))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement