import aiohttp
import re
import bs4
import os
import sys
import asyncio

max_connection = 10
loop = asyncio.get_event_loop()
sem = asyncio.Semaphore(value=max_connection)
async def request_and_write(path, img_name, img_request):
    """
    Get the image from the request and write it to disk
    :param path: absolute path to the destination folder
    :param img_name: image file name
    :param img_request: the image response to read data from
    :return: None
    """
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, img_name), 'wb') as image_file:
        # read the body in chunks so large images are not held in memory at once
        while True:
            chunk = await img_request.content.read(100000)
            if not chunk:
                break
            image_file.write(chunk)
    print('===> Done: {0} !'.format(img_name))
async def get_one_chapter(link, path):
    """
    Download one chapter (roughly 20-25 .jpg images)
    :param link: the link to one chapter
    :param path: destination folder for this chapter
    :return: None
    """
    # one session per chapter, closed on exit, instead of leaking a new session per request
    async with aiohttp.ClientSession() as session:
        async with session.get(link) as res_local:
            assert res_local.status == 200
            html_local = await res_local.read()
        soup = bs4.BeautifulSoup(html_local, 'html.parser')
        imgRegex = re.compile(r'.*\.jpg')
        # extract the src links
        for img in soup.select("a.img-link img[src]"):
            imgLink = img['src']
            img_name = re.search(imgRegex, os.path.basename(imgLink)).group()
            # request the image; the semaphore caps concurrent downloads
            async with sem:
                async with session.get(imgLink) as img_res:
                    await request_and_write(path, img_name, img_res)
async def get_all(seed, manga_name, folder):
    """
    Download all chapters of each version of the manga
    :param seed: the seed link (http://mangapark.me/manga/berserk)
    :param manga_name: manga name (Berserk)
    :param folder: save destination
    :return: None
    """
    # fetch the series page with a session that is properly closed
    async with aiohttp.ClientSession() as session:
        async with session.get(seed) as res:
            assert res.status == 200
            html = await res.read()
    soup = bs4.BeautifulSoup(html, 'html.parser')
    # get the list of versions
    versions = soup.select('a.st')
    print('There are %s versions' % len(versions))
    versionTags = ['s%s' % (x + 1) for x in range(len(versions))]
    print(versionTags)
    # start downloading all chapters
    for link in soup.find_all('a', target='_blank', text=re.compile('all')):
        # link behind the 'all' button on the web page
        chapLink = 'http://mangapark.me' + link.get('href')
        chapName = os.path.basename(chapLink)
        # iterate through each version in all links
        for ver in versionTags:
            if ver in chapLink:
                # download
                filePath = os.path.join(folder, manga_name, ver, chapName)
                await get_one_chapter(chapLink, filePath)
                print('@@@@@ DONE CHAPTER {1} OF VERSION {0}'.format(ver, chapName))
# main operation
script, seed_link, manga_name, folder = sys.argv
loop.run_until_complete(get_all(seed_link, manga_name, folder))
loop.close()
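
# Usage sketch (the script file name and the destination folder below are
# hypothetical; the seed URL follows the example in the get_all docstring):
#
#   python manga_downloader.py http://mangapark.me/manga/berserk Berserk ./downloads
#
# Downloaded images end up under <folder>/<manga_name>/<version_tag>/<chapter_name>/,
# matching the os.path.join(...) call in get_all.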