Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- import asyncio
- import itertools
- import json
- import random
- import time
- from typing import List
- import aiohttp
- from bs4 import BeautifulSoup
- from fake_useragent import UserAgent
def abs_url(url: str) -> str:
    """Turn a site-relative URL into an absolute https URL.

    Protocol-relative links (``//host/...``) get an ``https:`` scheme;
    root-relative links (``/path``) are prefixed with the shop's domain;
    anything else is already absolute and returned untouched.
    """
    if url.startswith("//"):
        return f"https:{url}"
    return f"https://069.net.ua{url}" if url.startswith("/") else url
async def get_page_items(session: aiohttp.ClientSession, semaph: asyncio.Semaphore, page_url: str) -> List[str]:
    """Fetch one catalogue page and return the absolute URLs of all items on it.

    The semaphore caps how many pages are downloaded concurrently; a short
    random sleep after each request keeps the crawl throttled.

    :param session: shared aiohttp session used for the request
    :param semaph: concurrency limiter shared by all page tasks
    :param page_url: absolute URL of the catalogue page to scrape
    :return: list of absolute item-page URLs found on the page
    """
    async with semaph:
        async with session.get(page_url) as resp:
            page_src = await resp.text(errors="replace")
        page_soup = BeautifulSoup(page_src, "lxml")
        # NOTE(review): find() returns None when the tag is absent, which would
        # raise AttributeError here — confirm the markup is always present.
        header = page_soup.find("h1").text
        current_page = page_soup.find(class_="pgsn").text
        # find_all() is the modern spelling of the deprecated findAll alias.
        page_items_list: List[str] = [
            abs_url(el.a.get("href")) for el in page_soup.find_all(class_="desc")
        ]
        print(f"{header} {current_page} done")
        await asyncio.sleep(random.randint(1, 3))
    return page_items_list
async def get_item_content(session: aiohttp.ClientSession, semaph: asyncio.Semaphore, item_url: str) -> None:
    """Download one item page, print its parsed fields and archive the raw HTML.

    :param session: shared aiohttp session used for the request
    :param semaph: concurrency limiter shared by all item tasks
    :param item_url: absolute URL of the item page
    """
    async with semaph:
        await asyncio.sleep(random.randint(1, 3))
        async with session.get(item_url) as resp:
            itempage_src = await resp.text(errors="replace")
        page_soup = BeautifulSoup(itempage_src, "lxml")
        item_name = page_soup.find("h1").text
        # NOTE(review): code and colour use the *identical* selector/index —
        # this looks like a copy-paste bug; confirm the correct field for colour.
        item_code = page_soup.find_all(class_="g-r")[-2].text
        item_color = page_soup.find_all(class_="g-r")[-2].text
        photo_link = abs_url(page_soup.find(id="s-gal-bp").img.get("src"))
        # Every <option> except the "choose a size" placeholder is a real size.
        size_list: List[str] = [
            opt.text for opt in page_soup.find_all("option") if opt.text != "Виберіть"
        ]
        item_category = page_soup.find_all(itemprop="name")[1].text
        try:
            item_price = page_soup.find(class_="prcn").text
        except AttributeError:
            # find() returned None: no price element on the page.
            # (Fallback text fixed — it previously said "no name".)
            item_price = "no price"
        item_availability = "В наявності" if size_list else "Відсутній"
        item_brand = page_soup.find_all(itemprop="name")[2].text
        print()
        print("===========================================")
        print(f"назва товару - {item_name}")
        print(f"код товару - {item_code}")
        print(f"посилання на фото товару - {photo_link}")
        print(f"колір - {item_color}")
        print(f"розміри- {size_list}")
        print(f"категорія - {item_category}")
        print(f"ціна - {item_price}")
        print(f"наявність - {item_availability}")
        print(f"бренд - {item_brand}")
        print("===========================================")
        # Keep the raw HTML so pages can be re-parsed offline later.
        with open(f"try/{time.time()}.html", "w", encoding="utf-8") as file:
            file.write(itempage_src)
        await asyncio.sleep(random.randint(7, 11))
async def main() -> None:
    """Crawl the whole catalogue: categories -> category pages -> items."""
    headers = {"User-Agent": UserAgent().random}
    connector = aiohttp.TCPConnector(limit=30)
    # Idiomatic: create *and* close the session with a single async-with.
    async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
        # Category links live in the site header (the "tabs" element).
        async with session.get(abs_url("/ua/")) as resp:
            indexpage_src = await resp.text(errors="replace")
        categories_soup = BeautifulSoup(indexpage_src, "lxml").find(id="tabs")
        category_dict = {
            category.text: abs_url(category["href"])
            for category in categories_soup.find_all(class_="sta")
        }
        with open("files/Categories.json", "w", encoding="utf-8") as file:
            json.dump(category_dict, file, indent=4, ensure_ascii=False)
        # Collect the URL of every page of every category.
        all_pages_list: List[str] = []
        for name, url in category_dict.items():
            await asyncio.sleep(1)
            async with session.get(url) as resp:
                categorypage_src = await resp.text(errors="replace")
            pagination_links = BeautifulSoup(categorypage_src, "lxml").find(id="flt-opt").find_all(class_="pgs")
            # NOTE(review): [-2] assumes at least two pagination links; a
            # single-page category would raise IndexError — confirm the markup.
            last_pageno = int(pagination_links[-2].text.strip())
            for pageno in range(1, last_pageno + 1):
                all_pages_list.append(f"{url}?page={pageno}/")
            print(f"{name} done")
        # Gather item links from every page; the semaphore throttles the crawl.
        semaph = asyncio.Semaphore(15)
        tasks = [
            asyncio.create_task(get_page_items(session, semaph, page_url))
            for page_url in all_pages_list
        ]
        # chain.from_iterable avoids eagerly unpacking the gathered lists.
        all_items_list = itertools.chain.from_iterable(await asyncio.gather(*tasks))
        # Finally download every individual item page.
        tasks = [
            asyncio.create_task(get_item_content(session, semaph, item_url))
            for item_url in all_items_list
        ]
        await asyncio.gather(*tasks)
    print("Finished!")


if __name__ == "__main__":
    asyncio.run(main())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement