#!/usr/bin/env python3
# Async scraper for 069.net.ua: collects categories, catalogue pages, and product data.
# Requires the aiohttp, beautifulsoup4, lxml, and fake-useragent packages.
import asyncio
import itertools
import json
import random
import time
from typing import List

import aiohttp
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


def abs_url(url: str) -> str:
    # Convert a relative URL into an absolute one
    if url.startswith("//"):
        return f"https:{url}"
    if url.startswith("/"):
        return f"https://069.net.ua{url}"
    return url


async def get_page_items(session: aiohttp.ClientSession,
                         semaph: asyncio.Semaphore,
                         page_url: str) -> List[str]:
    # Collect absolute product URLs from one catalogue page
    page_items_list: List[str] = []
    async with semaph:
        async with session.get(page_url) as resp:
            page_src = await resp.text(errors="replace")
        page_soup = BeautifulSoup(page_src, "lxml")
        header = page_soup.find("h1").text
        current_page = page_soup.find(class_="pgsn").text
        for el in page_soup.find_all(class_="desc"):
            page_items_list.append(abs_url(el.a.get("href")))
        print(f"{header} {current_page} done")
        await asyncio.sleep(random.randint(1, 3))
    return page_items_list


async def get_item_content(session: aiohttp.ClientSession,
                           semaph: asyncio.Semaphore,
                           item_url: str) -> None:
    # Download one product page, print its details, and save the raw HTML
    async with semaph:
        await asyncio.sleep(random.randint(1, 3))
        async with session.get(item_url) as resp:
            itempage_src = await resp.text(errors="replace")
        page_soup = BeautifulSoup(itempage_src, "lxml")
        item_name = page_soup.find("h1").text
        item_code = page_soup.find_all(class_="g-r")[-2].text
        photo_link = abs_url(page_soup.find(id="s-gal-bp").img.get("src"))
        # The original read the same [-2] element as item_code here; [-1] is
        # an assumed fix so the color comes from its own ".g-r" block.
        item_color = page_soup.find_all(class_="g-r")[-1].text
        size_list: List[str] = []
        for item in page_soup.find_all("option"):
            # "Виберіть" ("Choose") is the placeholder option in the site's size list
            if item.text != "Виберіть":
                size_list.append(item.text)
        item_category = page_soup.find_all(itemprop="name")[1].text
        price_tag = page_soup.find(class_="prcn")
        item_price = price_tag.text if price_tag else "no price"
        item_availability = "in stock" if size_list else "out of stock"
        item_brand = page_soup.find_all(itemprop="name")[2].text
        print()
        print("===========================================")
        print(f"product name - {item_name}")
        print(f"product code - {item_code}")
        print(f"product photo link - {photo_link}")
        print(f"color - {item_color}")
        print(f"sizes - {size_list}")
        print(f"category - {item_category}")
        print(f"price - {item_price}")
        print(f"availability - {item_availability}")
        print(f"brand - {item_brand}")
        print("===========================================")
        # The "try/" directory is expected to exist
        with open(f"try/{time.time()}.html", "w", encoding="utf-8") as file:
            file.write(itempage_src)
        await asyncio.sleep(random.randint(7, 11))


async def main() -> None:
    headers = {"User-Agent": UserAgent().random}
    connector = aiohttp.TCPConnector(limit=30)
    async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
        # Fetch the list of categories from the site header
        async with session.get(abs_url("/ua/")) as resp:
            indexpage_src = await resp.text(errors="replace")
        categories_soup = BeautifulSoup(indexpage_src, "lxml").find(id="tabs")
        category_dict = {}
        for category in categories_soup.find_all(class_="sta"):
            category_dict[category.text] = abs_url(category["href"])
        # The "files/" directory is expected to exist
        with open("files/Categories.json", "w", encoding="utf-8") as file:
            json.dump(category_dict, file, indent=4, ensure_ascii=False)

        # Collect links to every page of every category
        all_pages_list: List[str] = []
        for name, url in category_dict.items():
            await asyncio.sleep(1)
            async with session.get(url) as resp:
                categorypage_src = await resp.text(errors="replace")
            pagination_links = (BeautifulSoup(categorypage_src, "lxml")
                                .find(id="flt-opt").find_all(class_="pgs"))
            last_pageno = int(pagination_links[-2].text.strip())
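            # Note (assumption): indexing with [-2] presumes the last ".pgs"
            # element is a "next page" arrow, so the second-to-last link
            # carries the highest page number; this mirrors the original code.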
            for pageno in range(1, last_pageno + 1):
                all_pages_list.append(f"{url}?page={pageno}/")
            print(f"{name} done")

        # Collect links to every product from every page;
        # the semaphore caps how many requests run concurrently
        semaph = asyncio.Semaphore(15)
        tasks = [
            asyncio.create_task(get_page_items(session, semaph, page_url))
            for page_url in all_pages_list
        ]
        all_items_list = itertools.chain(*await asyncio.gather(*tasks))

        # And now download every product page
        tasks = [
            asyncio.create_task(get_item_content(session, semaph, item_url))
            for item_url in all_items_list
        ]
        await asyncio.gather(*tasks)
    print("Finished!")


if __name__ == "__main__":
    asyncio.run(main())
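# ---------------------------------------------------------------------------
# Optional hardening, a minimal sketch and not part of the original script:
# if the site starts throttling or dropping connections, each session.get call
# above could go through a small retry wrapper instead. fetch_with_retries and
# its parameters are illustrative assumptions, not an existing API.
#
# async def fetch_with_retries(session: aiohttp.ClientSession, url: str,
#                              attempts: int = 3) -> str:
#     for attempt in range(attempts):
#         try:
#             async with session.get(url) as resp:
#                 resp.raise_for_status()
#                 return await resp.text(errors="replace")
#         except aiohttp.ClientError:
#             if attempt == attempts - 1:
#                 raise
#             # Exponential backoff between attempts: 1s, 2s, 4s, ...
#             await asyncio.sleep(2 ** attempt)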