Advertisement
Guest User

Untitled

a guest
Feb 13th, 2016
97
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.00 KB | None | 0 0
  1. import aiohttp
  2. import re
  3. import bs4
  4. import os
  5. import sys
  6. import asyncio
  7.  
# Maximum number of simultaneous image downloads allowed at once.
max_connection = 10
# Shared event loop: every coroutine below and the final run_until_complete use it.
loop = asyncio.get_event_loop()
# Semaphore capping in-flight image requests at max_connection (acquired per image).
sem = asyncio.Semaphore(value=max_connection)
  11.  
  12. async def request_and_write(path, img_name, img_request):
  13. """
  14. Get the image from request and write to disk
  15. :param path: absolute path to folder
  16. :param img_name: image name
  17. :param img_request: the image request to get data from
  18. :return: None
  19. """
  20. os.makedirs(path, exist_ok=True)
  21. with open(os.path.join(path, img_name), 'wb') as image_file:
  22. while True:
  23. chunk = await img_request.content.read(100000)
  24. if not chunk:
  25. break
  26. image_file.write(chunk)
  27. print('===> Done: {0} !'.format(img_name))
  28.  
  29. async def get_one_chapter(link, path):
  30. """
  31. Download one chapter ~ 20-25 .jpg images
  32. :param link: the link to one chapter
  33. :param path:
  34. :return: None
  35. """
  36. async with aiohttp.ClientSession(loop=loop).get(link) as res_local:
  37. assert res_local.status == 200
  38. html_local = await res_local.read()
  39. soup = bs4.BeautifulSoup(html_local, 'html.parser')
  40. imgRegex = re.compile('.*.jpg')
  41.  
  42. # extract the src links
  43. for img in soup.select("a.img-link img[src]"):
  44. imgLink = img['src']
  45. img_name = re.search(imgRegex, os.path.basename(imgLink)).group()
  46. #request the image
  47. async with sem:
  48. async with aiohttp.ClientSession(loop=loop).get(imgLink) as img_res:
  49. await request_and_write(path, img_name, img_res)
  50.  
  51. async def get_all(seed, manga_name, folder):
  52. """
  53. Performs downloading all chapters of each version of the manga
  54. :param seed: the seed link (http://mangapark.me/manga/berserk)
  55. :param manga_name: manga name (Berserk)
  56. :param folder: save destination
  57. :return: None
  58. """
  59. async with aiohttp.ClientSession(loop=loop).get(seed) as res:
  60. assert res.status == 200
  61. html = await res.read()
  62. soup = bs4.BeautifulSoup(html, 'html.parser')
  63.  
  64. # get List of versions
  65. versions = [ver for ver in soup.select('a.st')]
  66. print('There are %s versions' %len(versions))
  67. versionTags = ['s%s' %(x+1) for x in range(len(versions))]
  68. print(versionTags)
  69.  
  70. #start downloading all chapters
  71. for link in soup.find_all('a', target = '_blank', text = re.compile('all')):
  72. #link to the 'all' button in web page
  73. chapLink = 'http://mangapark.me' + link.get('href')
  74. chapName = os.path.basename(chapLink)
  75.  
  76. # iterate through each version in all links
  77. for ver in versionTags:
  78. if ver in chapLink:
  79. # download
  80. filePath = os.path.join(folder, manga_name, ver, chapName)
  81. await get_one_chapter(chapLink, filePath)
  82. print('@@@@@ DONE CHAPTER {1} OF VERSION {0}'.format(ver, chapName))
  83.  
  84.  
  85. # main operation
  86. script, seed_link, manga_name, folder = sys.argv
  87. loop.run_until_complete(get_all(seed_link, manga_name, folder))
  88. loop.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement