# readmanga.me_downloader.py

# usage: python3 readmanga.me_downloader.py <link>

import json
import requests
import os
import bs4
import re
import sys
from multiprocessing import Pool

import time


def download_manga(manga):
    """Download every page of *manga* into a directory tree under ``./<title>/``.

    *manga* is a dict with the keys:

    * ``'title'``   - used as the name of the top-level directory;
    * ``'volumes'`` - mapping of volume -> mapping of chapter -> list of
      page URLs;
    * ``'count'``   - total number of pages, used for progress reporting
      and for the final summary line.

    Pages are stored as ``volume_<v>/chapter_<c>/<index>.<ext>``. Failure
    messages returned by ``download_pic`` and a final summary line are
    appended to ``download_info.txt`` inside the manga directory.
    """
    print('\ndownloading...')
    path = './%s/' % manga['title']
    os.makedirs(path, exist_ok=True)

    total = manga['count']
    count = 0
    downloaded_pages = 0

    # Both resources are context-managed so the log file is closed and the
    # worker processes are torn down even if a chapter raises midway.
    with open(path + 'download_info.txt', 'a') as info_file, Pool(40) as pool:
        for volume, chapters in manga['volumes'].items():
            v_path = '%s/volume_%s' % (path, volume)
            os.makedirs(v_path, exist_ok=True)

            for chapter_num, pages in chapters.items():
                ch_path = '%s/chapter_%s' % (v_path, chapter_num)
                os.makedirs(ch_path, exist_ok=True)

                # (target file name, source URL) pairs; the extension is
                # whatever follows the last dot of the URL.
                pics = [
                    ('%s/%d.%s' % (ch_path, k, url[url.rindex('.') + 1:]), url)
                    for k, url in enumerate(pages)
                ]

                # the actual download
                result = pool.map(download_pic, pics)
                count += len(pages)

                fails = [r for r in result if r != 'ok']
                downloaded_pages += len(result) - len(fails)

                info_file.write(''.join(fails))

                # Guard against a zero page total so progress reporting
                # cannot raise ZeroDivisionError.
                percent = float(count) / total * 100 if total else 100.0
                sys.stdout.write('\rdownload: chapter %s. total progress: %.2f%%.' %
                                 (chapter_num, percent))
                sys.stdout.flush()

        info_file.write('%d of %d pictures total were downloaded\n' % (downloaded_pages, total))


def download_pic(i):
    """Download one picture and report the outcome as a string.

    *i* is a ``(file_name, url)`` pair. Returns ``'ok'`` when the file
    already exists or was downloaded successfully; otherwise returns a
    newline-terminated failure message suitable for the download log.
    """
    name, address = i[0], i[1]

    # Already on disk (e.g. from a previous run): nothing to do.
    if os.path.exists(name):
        return 'ok'

    failure = 'pic %-60s was NOT downloaded. response status: %s\n'

    # Fetch before opening the target file: opening first would leave an
    # empty file behind on failure, and the existence check above would then
    # treat that page as already downloaded on the next run.
    try:
        r = requests.get(address)
    except requests.RequestException as err:
        # Report instead of raising so one bad page does not abort the
        # whole pool.map() call in the parent process.
        return failure % (name, 'no response (%r)' % (err,))

    if not r.ok:
        return failure % (name, r.status_code)

    with open(name, 'wb') as f:
        f.write(r.content)
    return 'ok'


# pull the page image links out of the chapter page's inline script
def _extract_page_links(html):
    """Return the page image URLs found in the ``rm_h.init(...)`` call of *html*."""
    import ast

    # Cut the text down to the script fragment that starts at the init call.
    script = html[html.index('rm_h.init('):]
    script = script[:script.index('</script>')]

    # Keep the outermost brackets so the text is always one list of entries,
    # including when the chapter has a single page or none at all.
    literal = script[script.index('['):script.rindex(']') + 1]

    # Each entry looks like
    # ['auto/03/21','http://e3.postfact.ru/',"/00/Wa__v01_c001_009.png_res.jpg",1100,1600]
    # and the URL is entry[1] + entry[0] + entry[2].
    #
    # SECURITY: this text comes from a remote server. It used to be passed to
    # eval(), which would execute arbitrary code; ast.literal_eval() accepts
    # only plain literals (lists, strings, numbers).
    return [p[1] + p[0] + p[2] for p in ast.literal_eval(literal)]


def get_all_pages(chapter):
    """Fetch a chapter page and return the list of its page image URLs.

    *chapter* is the site-relative link of the chapter; it is appended to
    ``http://readmanga.me``.
    """
    r = requests.get('http://readmanga.me' + chapter)
    return _extract_page_links(r.text)


def start(link):
    """Collect the page links for the manga at *link* and download them.

    The first chapter page is fetched to read the title and the chapter
    selector. If ``<title>.json`` already exists in the working directory,
    the manga description stored there is used and the chapters are not
    scanned again. Otherwise every chapter is visited, the description is
    saved to ``<title>.json``, and the download starts.

    Raises ``Exception`` when the first request does not succeed.
    """
    if not link.endswith('/'):
        link += '/'
    link += 'vol1/1?mature=1'
    r = requests.get(link, headers={'User-Agent': "don't block me pls"})

    if not r.ok:
        raise Exception(repr(r))

    print('getting manga info...')
    s = bs4.BeautifulSoup(r.text, 'html.parser')
    title = str(s.find('a', {'class': 'manga-link'}).string)
    cache_file = title + '.json'

    if os.path.exists(cache_file):
        # Read the cache and close it before the (long) download begins.
        with open(cache_file, 'r') as f:
            manga_info = json.load(f)
        download_manga(manga_info)
        return

    # links to all chapters
    selector = s.find('select', {'id': 'chapterSelectorSelect'})
    data = [v.attrs['value'] for v in selector.contents if isinstance(v, bs4.Tag)]

    data.reverse()
    print('chapters: %d' % len(data))

    # Raw strings avoid invalid-escape warnings; the capture groups return
    # just the digits, so no character stripping is needed afterwards.
    vol_regex = re.compile(r'/vol([0-9]+)/')
    chapter_regex = re.compile(r'/([0-9]+)\?')

    volumes = {}

    count = 0
    for c in data:
        volume_num = int(vol_regex.findall(c)[0])
        chapter_num = int(chapter_regex.findall(c)[0])

        sys.stdout.write('\rget: volume: %3d. chapter: %4d' % (volume_num, chapter_num))
        sys.stdout.flush()

        # fetch the links to the pictures
        pages = get_all_pages(c)
        count += len(pages)

        volumes.setdefault(volume_num, {})[chapter_num] = pages

    manga_info = {
        'title': title,
        'volumes': volumes,
        'count': count
    }

    with open(cache_file, 'w') as f:
        f.write(json.dumps(manga_info))

    download_manga(manga_info)


if __name__ == '__main__':
    # The only expected failure here is a missing command-line argument;
    # catching just IndexError keeps unrelated errors visible.
    try:
        link = sys.argv[1]
    except IndexError:
        print('use: python3 readmanga.me_downloader.py <link>\n'
              'example: python3 readmanga.me_downloader.py http://readmanga.me/wa')
        # sys.exit is always available; the bare exit() helper is installed
        # by the site module and is meant for interactive use.
        sys.exit(1)

    # Report the wall-clock duration of the whole run.
    t = time.time()
    start(link)
    print('\n%f seconds' % (time.time() - t))