Pandaaaa906

tmall_comment

May 15th, 2021
642
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import asyncio
  2.  
  3. from lxml.etree import HTML
  4. from more_itertools import first
  5. from pyppeteer import launch
  6. from pyppeteer_stealth import stealth
  7.  
  8.  
  9. url = 'https://detail.tmall.com/item.htm?' \
  10.       'spm=a1z10.5-b-s.w4011-21229599754.159.a43d3ab8lMMAyo&' \
  11.       'id=636149087216&rn=6d43f66ab34e0ad782135f76e059ddc7&abbucket=1'
  12.  
  13.  
  14. async def main():
  15.     browser = await launch(
  16.         headless=False,
  17.         args=[
  18.             '--no-sandbox',
  19.         ]
  20.     )
  21.     page = await browser.newPage()
  22.     await stealth(page)
  23.     await page.goto(url)
  24.  
  25.     # 关闭登录框
  26.     try:
  27.         elem = await page.waitForXPath('//div[@class="baxia-dialog-close"]')
  28.         await elem.click()
  29.     except TimeoutError:
  30.         pass
  31.     # 点击评论
  32.     review = await page.waitForXPath('//a[text()="累计评价 "]/parent::li')
  33.     # magic sleep
  34.     await asyncio.sleep(1)
  35.     await review.click()
  36.  
  37.     while True:
  38.         await page.waitForXPath('//div[@class="rate-grid"]//tr')
  39.         # 拿整个网页的html
  40.         html = HTML(await page.content())
  41.         rows = html.xpath('//div[@class="rate-grid"]//tr')
  42.         for row in rows:
  43.             d = {
  44.                 'rate_content': first(row.xpath('.//div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]//text()'), None),
  45.                 'rate_date': first(row.xpath('.//div[@class="tm-rate-date"]/text()'), None),
  46.                 'rate_reply': first(row.xpath('.//div[@class="tm-rate-reply"]//text()'), None),
  47.                 'rate_sku': ';'.join(row.xpath('.//div[@class="rate-sku"]/p/text()')),
  48.                 'rate_user': ''.join(row.xpath('.//div[@class="rate-user-info"]//text()')),
  49.             }
  50.             print(d)
  51.  
  52.         next_page = await page.waitForXPath('//div[@class="rate-paginator"]/span[not(@class)]/following-sibling::a')
  53.         if not next_page:
  54.             break
  55.         await next_page.click()
  56.         pass
  57.  
  58.  
  59. if __name__ == '__main__':
  60.     asyncio.run(main())
  61.  
RAW Paste Data