Pandaaaa906

netbian

Apr 23rd, 2021
623
Never
Not a member of Pastebin yet? Sign up; it unlocks many cool features!
  1. import asyncio
  2. import shelve
  3. from os import path
  4. from pathlib import Path
  5.  
  6. from aiofile import async_open
  7. from httpx import AsyncClient
  8. from loguru import logger
  9. from lxml.etree import HTML
  10. from more_itertools import first
  11.  
  12. url_tmpl = 'http://www.netbian.com/index{}.htm'
  13. downloads_dir = Path('downloads')
  14. trans = str.maketrans('', '', r'\/:*?"<>|')
  15. logger.add('netbian.log', rotation='1 weeks')
  16. headers = {
  17.     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
  18.                   'AppleWebKit/537.36 (KHTML, like Gecko) '
  19.                   'Chrome/89.0.4389.128 Safari/537.36'
  20. }
  21. cache = shelve.open('cache', writeback=True)
  22.  
  23.  
  24. async def download_img(client: AsyncClient, url):
  25.     r = await client.get(url)
  26.     r.encoding = 'gbk'
  27.     html = HTML(r.text)
  28.     img_url = first(html.xpath('//div[@class="pic"]//img/@src'), None)
  29.     title = first(html.xpath('//div[@class="pic"]//img/@title'), None)
  30.     if not img_url:
  31.         return
  32.     *_, ext = path.splitext(img_url)
  33.     r = await client.get(img_url)
  34.     if r.status_code != 200:
  35.         logger.warning(f'{img_url=} got wrong status code: {r.status_code}')
  36.         return
  37.     async with async_open(downloads_dir / f'{title.translate(trans)}{ext}', 'wb') as f:
  38.         async for t in r.aiter_bytes():
  39.             await f.write(t)
  40.  
  41.  
  42. async def main(start_over=False):
  43.     logger.info('Staring')
  44.     cur_page = cache.get('cur_page', 1) if not start_over else 1
  45.     cur_url = url_tmpl.format(f'_{cur_page}' if cur_page != 1 else '')
  46.     async with AsyncClient(headers=headers, timeout=30) as client:
  47.         while cur_url:
  48.             r = await client.get(cur_url)
  49.             html = HTML(r.text)
  50.             tasks = (
  51.                 asyncio.create_task(download_img(client, r.url.join(rel_url)))
  52.                 for rel_url in html.xpath('//div[@class="list"]//li/a[img]/@href')
  53.             )
  54.             await asyncio.gather(*tasks)
  55.             cur_url = (tmp := first(html.xpath('//a[@class="prev"][last()]/@href'), None)) and r.url.join(tmp)
  56.             logger.info(f'page: {cur_page} finish download')
  57.             cur_page += 1
  58.             cache['cur_page'] = cur_page
  59.     logger.info('Finished downloading')
  60.  
  61.  
  62. if __name__ == '__main__':
  63.     downloads_dir.mkdir(exist_ok=True)
  64.     asyncio.run(main())
  65.  
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound or popup ads; we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×