import aiohttp
import asyncio
from lxml import html
from aiolimiter import AsyncLimiter
import random
import signal
from log_setup import setup_logger
from state_manager import create_hash, load_previous_state, save_current_state, save_progress, load_progress
from utils import get_categories
from upload_product_files import upload_all_product_files, load_config
from save_json_file import save_to_json
import cachetools


logger = setup_logger()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Pages are cached by URL for an hour. A cachetools decorator cannot be applied
# directly to a coroutine function (it would cache the coroutine object itself,
# which can only be awaited once), so the cache is checked and filled manually.
page_cache = cachetools.TTLCache(maxsize=1000, ttl=3600)


# Perform a request with retries and exponential backoff
async def fetch_with_retry(session, url, max_retries=3):
    if url in page_cache:
        return page_cache[url]

    for attempt in range(max_retries):
        try:
            async with session.get(url, headers=headers) as response:
                if response.status == 200:
                    text = await response.text()
                    page_cache[url] = text
                    return text
                elif response.status == 429:
                    wait_time = random.uniform(0.5, 1.5)
                    logger.warning(f"Too many requests. Waiting {wait_time:.2f} seconds...")
                    await asyncio.sleep(wait_time)
                else:
                    logger.error(f"Error {response.status} while requesting {url}")
        except aiohttp.ClientError as e:
            logger.error(f"Error requesting {url}: {str(e)}")

        wait_time = 2 ** attempt + random.uniform(0, 1)
        logger.info(f"Retrying in {wait_time:.2f} seconds...")
        await asyncio.sleep(wait_time)

    raise Exception(f"Failed to fetch data after {max_retries} attempts")

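# Minimal usage sketch of fetch_with_retry (illustration only; the URL is a placeholder,
# not taken from the original paste):
#
#   async def example():
#       async with aiohttp.ClientSession() as session:
#           page = await fetch_with_retry(session, "https://megamailerdata.com/")
#           tree = html.fromstring(page)
#
# On repeated failures the backoff grows as 2 ** attempt + random.uniform(0, 1),
# i.e. roughly 1-2 s, 2-3 s and 4-5 s before the final exception is raised.
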
async def scrape_products(categories):
    base_url = "https://megamailerdata.com"
    product_count = 0
    file_counter = 1
    batch_size = 20
    limiter = AsyncLimiter(100, 1)  # up to 100 limiter acquisitions per second

    state = load_previous_state()
    progress = load_progress()

    async with aiohttp.ClientSession() as session:
        try:
            # Resume from the last processed category if it is still in the list
            start_index = categories.index(state['last_processed_category']) if state['last_processed_category'] in categories else 0

            for i in range(start_index, len(categories), batch_size):
                batch = categories[i:i + batch_size]
                logger.info(f"Processing category batch {i + 1}-{min(i + batch_size, len(categories))} of {len(categories)}")

                tasks = []
                for category_url in batch:
                    category_progress = progress if category_url == progress['category'] else {'category': category_url, 'page': 1, 'product_index': 0}

                    logger.info(f"Starting category: {category_url}")
                    save_progress(category_progress['category'], category_progress['page'], category_progress['product_index'])

                    task = asyncio.create_task(scrape_category(session, category_url, base_url, file_counter, limiter, category_progress, state))
                    tasks.append(task)

                results = await asyncio.gather(*tasks, return_exceptions=True)

                for category_url, result in zip(batch, results):
                    if isinstance(result, Exception):
                        logger.error(f"Error processing category {category_url}: {str(result)}")
                    elif result is not None:
                        category_product_count, file_counter = result
                        product_count += category_product_count
                        logger.info(f"Finished category: {category_url}. Total products: {product_count}")

                progress['category'] = batch[-1]
                state['last_processed_category'] = batch[-1]
                state['last_processed_page'] = 1
                progress['page'] = 1
                progress['product_index'] = 0
                save_progress(progress['category'], progress['page'], progress['product_index'])
                save_current_state(state)

        except KeyboardInterrupt:
            logger.warning("\nScraping stopped by user. Progress saved.")
            save_progress(progress['category'], progress['page'], progress['product_index'])
            save_current_state(state)

    logger.info(f"Total products processed: {product_count}")

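# Note on throttling: AsyncLimiter(100, 1) above allows at most 100 limiter acquisitions
# per second, shared by every task in a batch; together with the random 0.5-1.5 s sleeps
# in scrape_category and the 429 handling in fetch_with_retry, this is what paces the
# requests sent to the target site.
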
async def scrape_category(session, category_url, base_url, file_counter, limiter, progress, state):
    async with limiter:
        try:
            await asyncio.sleep(random.uniform(0.5, 1.5))
            html_content = await fetch_with_retry(session, category_url)
            tree = html.fromstring(html_content)

            hierarchy = extract_hierarchy(tree)
            logger.info(f"Processing category: {hierarchy}")

            products = await parse_category_page(session, tree, base_url, hierarchy, limiter, progress, state)

            logger.info(f"Attempting to save {len(products)} products for category {hierarchy}")
            if products:
                new_file_counter = save_to_json(products, file_counter)
                logger.info(f"Save complete. New file number: {new_file_counter}")
            else:
                logger.warning(f"No products to save in category {hierarchy}")
                new_file_counter = file_counter

            # Save progress only after the whole category has been processed
            progress['page'] = 1
            progress['product_index'] = 0
            save_progress(category_url, progress['page'], progress['product_index'])

            # Force-save the state after the category is done
            save_current_state(state)

            logger.info(f"Products processed in category {hierarchy}: {len(products)}")
            return len(products), new_file_counter
        except Exception as e:
            logger.error(f"Error processing category {category_url}: {str(e)}")
            return 0, file_counter

def extract_hierarchy(tree):
    # Build the category breadcrumb path from the 'pathway' block
    pathway_divs = tree.xpath("//div[contains(@class, 'pathway')]")
    if pathway_divs:
        pathway_div = pathway_divs[0]
        hierarchy = []
        for link in pathway_div.xpath(".//a[contains(@class, 'pathwaylink')]"):
            hierarchy.append(link.text.strip())
        last_item = pathway_div.xpath(".//span[contains(@class, 'pathwaylink')]")
        if last_item:
            hierarchy.append(last_item[0].text.strip())
        return " / ".join(hierarchy)
    return "No data"

async def parse_category_page(session, tree, base_url, hierarchy, limiter, progress, state):
    product_blocks = tree.xpath("//div[contains(@class, 'stretch coin-c coin-god')]")
    logger.info(f"Found {len(product_blocks)} products in category {hierarchy}")

    tasks = []
    for block in product_blocks[progress['product_index']:]:
        task = asyncio.create_task(parse_product(session, block, base_url, hierarchy, limiter, progress, state))
        tasks.append(task)

    products = await asyncio.gather(*tasks)
    return [product for product in products if product]

async def parse_product(session, block, base_url, hierarchy, limiter, progress, state):
    try:
        product = {}
        product_name = block.xpath(".//h2/text()")[0].strip() if block.xpath(".//h2") else None
        img_tags = block.xpath(".//img[contains(@class, 'cboxPhoto')]")
        product_image = (img_tags[0].get('data-src') or img_tags[0].get('src')) if img_tags else "No image"

        if not product_name or product_image == "No image":
            return None

        product['hierarchy'] = hierarchy
        product['name'] = product_name
        product['image'] = product_image

        tables = block.xpath(".//div[@class='table']")
        if tables:
            table = tables[0]
            rows = table.xpath(".//div[@class='tr']")
            product['characteristics'] = []

            for row in rows:
                row_data = parse_row(row)

                # Parse prices
                price_block = row.xpath(".//div[contains(@class, 'prices')]")
                if price_block:
                    price_link = price_block[0].xpath(".//a")
                    if price_link:
                        price_count_text = price_link[0].xpath(".//span[2]/text()")
                        if price_count_text:
                            price_count_text = price_count_text[0].strip()
                            if price_count_text.isdigit() and int(price_count_text) > 0:
                                full_price_url = base_url + price_link[0].get('href')
                                logger.info(f"Found {price_count_text} prices for product {product_name}: {full_price_url}")
                                async with limiter:
                                    row_data['extra_prices'] = await parse_seller_page(session, full_price_url, limiter, state)
                            else:
                                row_data['extra_prices'] = []
                            row_data['prices'] = f"Price: {price_count_text}"
                        else:
                            logger.warning(f"Could not find price text for product {product_name}")
                            row_data['prices'] = "Price: not found"
                    else:
                        logger.warning(f"Price block found for product {product_name}, but it has no link")
                        row_data['prices'] = "Price: 0 (no link)"
                else:
                    logger.warning(f"No price block found for product {product_name}")
                    row_data['prices'] = "Price: 0 (price block not found)"

                product['characteristics'].append(row_data)

        return product
    except Exception as e:
        logger.error(f"Error processing product {product_name if 'product_name' in locals() else 'Unknown'}: {str(e)}")
        return None

def parse_row(row):
    try:
        return {
            'year': row.xpath(".//div[contains(@class, 'year')]/text()")[0].strip() if row.xpath(".//div[contains(@class, 'year')]") else "Not specified",
            'sign': row.xpath(".//div[contains(@class, 'sign')]/text()")[0].strip() if row.xpath(".//div[contains(@class, 'sign')]") else "Not specified",
            'desc': "".join(row.xpath(".//div[contains(@class, 'desc')]//text()")).strip() if row.xpath(".//div[contains(@class, 'desc')]") else "Not specified",
            'gurt': row.xpath(".//div[contains(@class, 'gurt')]/@data-hide")[0].strip() if row.xpath(".//div[contains(@class, 'gurt')]/@data-hide") else "Not specified",
            'metal': row.xpath(".//div[contains(@class, 'metal')]/text()")[0].strip() if row.xpath(".//div[contains(@class, 'metal')]") else "Not specified",
            'bitkin': row.xpath(".//div[contains(@class, 'bitkin')]//a/@data-bitkin")[0].strip() if row.xpath(".//div[contains(@class, 'bitkin')]//a/@data-bitkin")
                      else "".join(row.xpath(".//div[contains(@class, 'bitkin')]//text()")).strip() if row.xpath(".//div[contains(@class, 'bitkin')]") else "Not specified",
            'adrianov': row.xpath(".//div[contains(@class, 'adrian')]/text()")[0].strip() if row.xpath(".//div[contains(@class, 'adrian')]") else "Not specified",
            'fedorin': row.xpath(".//a[contains(@class, 'group-1427')]/@data-bitkin")[0] if row.xpath(".//a[contains(@class, 'group-1427')]/@data-bitkin") else "Not specified",
        }
    except Exception as e:
        logger.error(f"Error parsing row: {str(e)}")
        return {}

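# For reference, parse_row always returns a flat dict with the keys 'year', 'sign',
# 'desc', 'gurt', 'metal', 'bitkin', 'adrianov' and 'fedorin'; fields that cannot be
# extracted fall back to "Not specified" rather than being omitted.
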
async def parse_seller_page(session, url, limiter, state):
    all_price_info = []
    current_page = 1
    total_prices = 0

    while True:
        page_url = f"{url}?page={current_page}#sort_table" if current_page > 1 else url
        logger.info(f"Processing price page: {page_url}")

        async with limiter:
            try:
                html_content = await fetch_with_retry(session, page_url)
                tree = html.fromstring(html_content)
                price_table = tree.xpath("//div[contains(@class, 'table')]")

                if price_table:
                    rows = price_table[0].xpath(".//div[contains(@class, 'tr')]")

                    if not rows:
                        logger.info(f"No price rows found on page {page_url}")
                        break

                    for i, row in enumerate(rows):
                        price_info = await parse_price_row_async(session, row, tree, limiter, state)
                        if price_info:
                            all_price_info.append(price_info)
                            total_prices += 1
                else:
                    logger.warning(f"No price table found on page {page_url}")
                    break

                # Check whether there is a next page
                pagebar = tree.xpath("//div[contains(@class, 'pagebar')]")
                if pagebar:
                    next_page = pagebar[0].xpath(".//a[contains(@class, 'pagebar_page next')]")
                    if next_page:
                        current_page += 1
                        logger.info(f"Moving to next page: {current_page}")
                    else:
                        logger.info(f"Reached last page {current_page} for {url}")
                        break
                else:
                    logger.info(f"No pagination bar found on page {page_url}")
                    break

            except Exception as e:
                logger.error(f"Error processing page {current_page} for {url}: {e}")
                break

    logger.info(f"Found {total_prices} prices for {url}")
    return all_price_info

async def parse_price_row_async(session, row, tree, limiter, state):
    row_data = parse_price_row(row)
    if row_data['price'] != "No data":
        specs_data = parse_technical_specs(tree)
        row_data['technical'] = specs_data['technical']
        row_data['literature'] = specs_data['literature']

        photo_div = row.xpath(".//div[contains(@class, 'photo')]")
        if photo_div:
            lot_link = photo_div[0].xpath(".//a/@href")[0]
            full_lot_url = f"https://megamailerdata.com{lot_link}"

            current_hash = create_hash(row_data)

            # Check whether this record was processed before with the same content hash
            if full_lot_url in state['processed_data'] and state['processed_data'][full_lot_url] == current_hash:
                logger.info(f"Skipping already processed record: {full_lot_url}")
                return None

            async with limiter:
                seller_description, lot_url, evaluation = await parse_seller_description(session, full_lot_url)
                row_data['seller_description'] = seller_description
                row_data['lot_url'] = lot_url
                row_data['evaluation'] = evaluation

            # Update the state
            state['processed_data'][full_lot_url] = current_hash
            save_current_state(state)

        return row_data
    return None

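# Deduplication note: each parsed price row is hashed with create_hash (from
# state_manager, implementation not included in this paste) and keyed by its lot URL
# in state['processed_data']; rows whose hash is unchanged are skipped on later runs.
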
async def parse_seller_description(session, url):
    try:
        html_content = await fetch_with_retry(session, url)
        tree = html.fromstring(html_content)

        description = "Seller description not found"
        lot_desc_nodes = tree.xpath("//div[contains(@class, 'lot-desc')]")
        if lot_desc_nodes:
            lot_desc = lot_desc_nodes[0]
            description_elements = lot_desc.xpath(".//p|.//ul|.//li")
            description_parts = []
            for element in description_elements:
                if element.tag == "p":
                    description_parts.append(element.text_content().strip())
                elif element.tag == "ul":
                    for li in element.xpath(".//li"):
                        description_parts.append(li.text_content().strip())
            description = "\n".join(description_parts)

        evaluation = "No data"
        price_spans = tree.xpath("//span[contains(@class, 'price_orig') and @data-cache-curs]")
        if price_spans:
            evaluation = price_spans[0].get('data-cache-curs')

        return description, url, evaluation

    except Exception as e:
        logger.error(f"Error parsing seller description for {url}: {str(e)}")

    return "Seller description not found", url, "No data"

def parse_price_row(row):
    return {
        'date': row.xpath(".//div[contains(@class, 'date')]/text()")[0].strip() if row.xpath(".//div[contains(@class, 'date')]") else "Not specified",
        'seller': "".join(row.xpath(".//div[contains(@class, 'wrap-name')]//div[contains(@class, 'name')]//div/text()")).strip() if row.xpath(".//div[contains(@class, 'wrap-name')]//div[contains(@class, 'name')]") else "Not specified",
        'condition': row.xpath(".//div[contains(@class, 'sohr')]/text()")[0].strip() if row.xpath(".//div[contains(@class, 'sohr')]") else "Not specified",
        'price': row.xpath(".//span[contains(@class, 'price_orig')]/text()")[0].strip() if row.xpath(".//span[contains(@class, 'price_orig')]") else "No data",
    }

def parse_technical_specs(tree):
    specs_data = {'technical': [], 'literature': []}
    specs_block = tree.xpath("//div[contains(@class, 'specs')]")
    if specs_block:
        tech_section = specs_block[0].xpath(".//ul")
        if tech_section:
            tech_list = tech_section[0].xpath("./li")
            specs_data['technical'] = [li.text_content().strip() for li in tech_list]

        literature_section = specs_block[0].xpath(".//ul[@id='literature']")
        if literature_section:
            literature_list = literature_section[0].xpath("./li")
            specs_data['literature'] = [li.text_content().strip() for li in literature_list]

    return specs_data

def signal_handler(signum, frame):
    raise KeyboardInterrupt()

if __name__ == "__main__":
    try:
        categories = get_categories()

        # Convert Ctrl+C into a KeyboardInterrupt so progress is saved on exit
        signal.signal(signal.SIGINT, signal_handler)

        logger.info("Script started")
        asyncio.run(scrape_products(categories))

    except KeyboardInterrupt:
        logger.info("Script interrupted by user. Progress saved.")
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
    finally:
        logger.info("Script finished")
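
# Assumed setup (not stated in the original paste): the third-party packages can be
# installed with `pip install aiohttp lxml aiolimiter cachetools`. The local modules
# log_setup, state_manager, utils, upload_product_files and save_json_file are not
# included here and are expected to provide the functions imported at the top of the file.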