Pandaaaa906

tmall master worker version

May 15th, 2021
933
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
import asyncio
from asyncio import Queue
from os import getenv

from loguru import logger
from lxml.etree import HTML
from more_itertools import first
from pyppeteer import launch
from pyppeteer.errors import TimeoutError as PyppeteerTimeoutError
from pyppeteer.page import Page
from pyppeteer_stealth import stealth
  11.  
  12.  
# Example single-product detail page. NOTE(review): not referenced by any
# function in this file -- presumably kept for manual testing; confirm before
# removing.
product_url = 'https://detail.tmall.com/item.htm?' \
              'spm=a1z10.5-b-s.w4011-21229599754.159.a43d3ab8lMMAyo&' \
              'id=636149087216&rn=6d43f66ab34e0ad782135f76e059ddc7&abbucket=1'
# Shop-wide "all products" listing; master() starts crawling here.
url_all_prd = 'https://skecherstx.tmall.com/?search=y'
# Producer-done signal: set True by master() when pagination is exhausted,
# read by worker() together with queue.empty() to decide when to stop.
exit_flag = False
  18.  
  19.  
  20. async def parse_page(page: Page):
  21.     # 关闭登录框
  22.     try:
  23.         elem = await page.waitForXPath('//div[@class="baxia-dialog-close"]')
  24.         await elem.click()
  25.     except TimeoutError:
  26.         pass
  27.     # 点击评论
  28.     review = await page.waitForXPath('//a[text()="累计评价 "]/parent::li')
  29.     # magic sleep
  30.     await asyncio.sleep(1)
  31.     await review.click()
  32.  
  33.     while True:
  34.         await page.waitForXPath('//div[@class="rate-grid"]//tr')
  35.         # 拿整个网页的html
  36.         html = HTML(await page.content())
  37.         rows = html.xpath('//div[@class="rate-grid"]//tr')
  38.         for row in rows:
  39.             d = {
  40.                 'rate_content': first(row.xpath('.//div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]//text()'), None),
  41.                 'rate_date': first(row.xpath('.//div[@class="tm-rate-date"]/text()'), None),
  42.                 'rate_reply': first(row.xpath('.//div[@class="tm-rate-reply"]//text()'), None),
  43.                 'rate_sku': ';'.join(row.xpath('.//div[@class="rate-sku"]/p/text()')),
  44.                 'rate_user': ''.join(row.xpath('.//div[@class="rate-user-info"]//text()')),
  45.             }
  46.             logger.info(d)
  47.  
  48.         next_page = await page.waitForXPath('//div[@class="rate-paginator"]/span[not(@class)]/following-sibling::a')
  49.         if not next_page:
  50.             break
  51.         await next_page.click()
  52.         pass
  53.  
  54.  
  55. async def init_page(page: Page):
  56.     await stealth(page)
  57.     await page.setViewport({
  58.         'width': 1200,
  59.         'height': 960
  60.     })
  61.     return page
  62.  
  63.  
  64. @logger.catch
  65. async def worker(browser, queue):
  66.     page = await browser.newPage()
  67.     page = await init_page(page)
  68.     logger.info(f'worker initialized')
  69.     while not exit_flag or not queue.empty():
  70.         url = await queue.get()
  71.         logger.debug(f'going to url: {url}')
  72.         await page.goto(url)
  73.         await parse_page(page)
  74.  
  75.  
  76. @logger.catch
  77. async def master(browser, queue):
  78.     global exit_flag
  79.     page = await browser.newPage()
  80.     page = await init_page(page)
  81.  
  82.     # 打开所有产品页面
  83.     await page.goto(url_all_prd)
  84.     while True:
  85.         await page.waitForXPath('//div[contains(@class, "item") and contains(@class, "line")]')
  86.  
  87.         # 提取产品链接
  88.         html = HTML(await page.content())
  89.         urls = html.xpath('//div[contains(@class, "item") and contains(@class, "line")]//dt/a/@href')
  90.  
  91.         for prd_url in urls:
  92.             await queue.put(prd_url)
  93.  
  94.         # 翻页
  95.         try:
  96.             next_page = await page.waitForXPath('//div[@class="pagination"]/a[@class="page-cur"]/following-sibling::a')
  97.         except TimeoutError:
  98.             logger.info(f'Might be detected')
  99.             break
  100.         if not next_page:
  101.             break
  102.         await next_page.click()
  103.     exit_flag = True
  104.  
  105.  
  106. async def main(headless: bool = True, n_workers: int = 1):
  107.     logger.info('Starting')
  108.     browser = await launch(
  109.         headless=headless,
  110.         args=[
  111.             '--no-sandbox',
  112.         ]
  113.     )
  114.     queue = Queue()
  115.     # 开一个master,若干worker
  116.     await asyncio.gather(
  117.         master(browser, queue),
  118.         *(worker(browser, queue) for _ in range(n_workers))
  119.     )
  120.  
  121.  
  122. if __name__ == '__main__':
  123.     n = getenv('N_WORKERS', 1)
  124.     asyncio.run(main(False, n))
  125.  
RAW Paste Data