# NHScraper

#!/usr/bin/env python

import os
import json
import requests
import argparse
from bs4 import BeautifulSoup

class nhentai(object):
    """Scraper that lists galleries on the site and downloads their images.

    Class attributes:
        base_url: root URL every listing and gallery URL is built from.
        prefix / suffix: optional text placed around each image number in
            the saved filename (empty means "not used").
    """

    base_url = 'http://nhentai.net/'
    prefix = ''
    suffix = ''

    def __init__(self, directory='hentai', prefix='', suffix=''):
        """Choose the output directory and sanitise the filename affixes.

        Args:
            directory: existing directory to save into. When it does not
                exist the scraper falls back to ``hentai/`` (created on
                demand). A falsy value also selects ``hentai/``.
            prefix: text put before the image number in filenames.
            suffix: text put after the image number in filenames.
        """
        if directory:
            if os.path.isdir(directory):
                # Stored with a trailing slash so paths can be concatenated.
                if directory.endswith('/'):
                    self.directory = directory
                else:
                    self.directory = directory + '/'
            else:
                print('[-] ' + directory + ' does not exist')
                print('[*] Defaulting to hentai')

                # exist_ok: the fallback directory may already be there even
                # though the requested one is not.
                os.makedirs('hentai', exist_ok=True)
                self.directory = 'hentai/'
        else:
            self.directory = 'hentai/'

        if prefix:
            self.prefix = self._sanitize(prefix)

        if suffix:
            self.suffix = self._sanitize(suffix)

    @staticmethod
    def _sanitize(text):
        """Keep only alphanumerics, space, '.' and '_'; drop trailing blanks."""
        allowed = (' ', '.', '_')
        return ''.join(c for c in text if c.isalnum() or c in allowed).rstrip()

    def getData(self, url):
        """Return the body of ``url`` as text.

        Any exception raised by the request is printed and the request is
        tried again.
        NOTE(review): the loop has no retry limit or delay, and an empty
        body is also retried, so an unreachable host blocks forever.
        """
        data = None

        while not data:
            try:
                r = requests.get(url, timeout=10)
                data = r.text
            except Exception as e:
                print('[-] Error: ' + str(e))

        return data

    def getLastPage(self, base_url=''):
        """Return the number of the last page of a listing.

        Args:
            base_url: listing URL without the ``?page=`` query; defaults to
                ``self.base_url``.

        Returns:
            The page number taken from the ``a.last`` link, or 1 when the
            page has no such link (presumably a single-page listing).
        """
        if not base_url:
            base_url = self.base_url

        data = self.getData(base_url + '?page=1')
        soup = BeautifulSoup(data, 'html.parser')

        last_link = soup.find('a', {'class': 'last'})
        if last_link is None:
            return 1

        # Take whatever follows the final 'page=' so both '?page=N' and
        # '/some/path?page=N' hrefs are understood.
        return int(last_link['href'].rsplit('page=', 1)[-1])

    def getGalleryIds(self, base_url='', limit=None):
        """Collect gallery ids from every page of a listing.

        Args:
            base_url: listing URL without the ``?page=`` query; defaults to
                ``self.base_url``.
            limit: maximum number of ids to return; falsy means no limit.

        Returns:
            A list of id strings in page order, truncated to ``limit``.
        """
        if not base_url:
            base_url = self.base_url

        ids = []

        for page in range(1, self.getLastPage(base_url) + 1):
            # Stop as soon as enough ids are collected; no extra page fetch.
            if limit and len(ids) >= limit:
                break

            # Page through the listing that was asked for, not the front page.
            page_url = base_url + '?page=' + str(page)
            print('[*] Getting gallery ids from ' + page_url, end='\r')

            soup = BeautifulSoup(self.getData(page_url), 'html.parser')

            for a in soup.findAll('a', {'class': 'cover'}):
                # The id is the third '/'-separated piece of the href.
                ids.append(a['href'].split('/')[2])

        if limit:
            return ids[:limit]

        return ids

    def getGalleryInfo(self, url):
        """Scrape a gallery page into a dict.

        Returns:
            dict with keys ``id``, ``url`` (base URL of the image files),
            ``name``, ``tags`` (list of str), ``ext`` (last three characters
            of the cover image src) and ``last`` (number of the last page).
        """
        print('[*] Getting gallery info from ' + url, end='\r')
        data = self.getData(url)
        soup = BeautifulSoup(data, 'html.parser')

        cover = soup.find('div', {'id': 'cover'})
        cover_src = cover.find('img')['src']
        # Drop the cover filename, then the first two characters of the src.
        # NOTE(review): assumes a protocol-relative src ('//host/...') - confirm.
        image_base = cover_src.replace('cover.jpg', '').replace('cover.png', '')[2:]

        gallery_info = {
            'id': cover.find('a')['href'].split('/')[2],
            'url': 'http://' + image_base,
            'name': soup.find('div', {'id': 'info'}).find('h1').getText().strip(),
            'tags': [],
            'ext': cover_src[-3:],
            # The last thumbnail link carries the highest page number.
            'last': int(soup.findAll('a', {'class': 'gallerythumb'})[-1]['href'].split('/')[-2])
        }

        for div in soup.findAll('div', {'class': 'field-name'}):
            if 'Tags' in div.getText():
                for a in div.findAll('a', {'class': 'tag'}):
                    # Strips five leading characters and one trailing one.
                    # NOTE(review): presumably hrefs look like '/tag/<name>/'.
                    gallery_info['tags'].append(a['href'][5:-1])

        return gallery_info

    def getGallery(self, url):
        """Build the download plan for the gallery at ``url``.

        Returns:
            dict with ``name``, ``tags``, ``directory`` (target directory,
            with trailing slash) and ``images``, a list of
            ``{'filename', 'url'}`` dicts, one per page.
        """
        gallery_info = self.getGalleryInfo(url)
        gallery = {
            'name': gallery_info['name'],
            'tags': gallery_info['tags'],
            'directory': self.directory + gallery_info['id'] + '/',
            'images': []
        }

        for i in range(1, gallery_info['last'] + 1):
            filename = str(i)

            if self.prefix:
                filename = self.prefix + '_' + filename

            if self.suffix:
                filename += '_' + self.suffix

            filename = filename + '.' + gallery_info['ext']
            gallery['images'].append({
                'filename': filename,
                # NOTE(review): the 't' infix looks like it selects a
                # thumbnail rendition rather than the full image - confirm.
                'url': gallery_info['url'] + str(i) + 't.' + gallery_info['ext']
            })

        return gallery

    def downloadImage(self, url, filename):
        """Download ``url`` to ``filename`` unless the file already exists.

        The file is only created after the server answered successfully, and
        a partially written file is removed on failure, so an unsuccessful
        attempt never leaves a file behind that later runs would skip.

        Returns:
            True if the image was written, False if it already existed or
            the server reported an error.
        """
        if os.path.isfile(filename):
            return False

        response = requests.get(url, stream=True, timeout=10)

        try:
            if not response.ok:
                print('[-] Error: something went wrong')
                return False

            try:
                with open(filename, 'wb') as f:
                    for block in response.iter_content(1024):
                        f.write(block)
            except BaseException:
                # Also covers Ctrl-C: never keep a truncated image.
                if os.path.isfile(filename):
                    os.remove(filename)
                raise
        finally:
            response.close()

        return True

    def downloadGallery(self, gid):
        """Download every image of gallery ``gid`` and write its info.json."""
        url = self.base_url + 'g/' + str(gid) + '/'
        gallery = self.getGallery(url)
        name = gallery['name']
        tags = gallery['tags']
        directory = gallery['directory']
        images = gallery['images']

        print('[*] Downloading gallery ' + name + ' to ' + directory)

        os.makedirs(directory, exist_ok=True)

        with open(directory + 'info.json', 'w') as f:
            json.dump({
                'url': url,
                'name': name,
                'tags': tags
            }, f)

        for image in images:
            self.downloadImage(image['url'], directory + image['filename'])

    def downloadGalleries(self, gids=None, tags=None, limit=None):
        """Download several galleries.

        Args:
            gids: gallery ids to download. When empty, ids are looked up:
                from each tag's popular listing if ``tags`` is given,
                otherwise from the main listing.
            tags: tag names used for the lookup when ``gids`` is empty.
            limit: maximum number of ids fetched per listing.

        An error in one gallery is printed and does not stop the others.
        """
        if not gids and not tags:
            gids = self.getGalleryIds(limit=limit)
        elif not gids and tags:
            gids = []

            for tag in tags:
                tag_url = self.base_url + 'tag/' + tag + '/popular'
                gids.extend(self.getGalleryIds(tag_url, limit=limit))

        for gid in gids:
            try:
                self.downloadGallery(gid)
            except Exception as e:
                print('[-] Error: ' + str(e))

# Command-line driver: parse the options, build the scraper and start the
# requested downloads.


def _positive_int(text):
    """Return ``text`` as an int if it is a positive decimal number, else None.

    ``isdecimal`` is used because every string it accepts is also accepted
    by ``int()``; ``isnumeric`` also accepts characters such as fractions
    that make ``int()`` raise ValueError.
    """
    if text and text.isdecimal() and int(text) > 0:
        return int(text)

    return None


def _parse_gallery_ids(spec):
    """Expand a spec such as ``'3,10-12'`` into ``['3', '10', '11', '12']``.

    Items are separated by commas; an item is either one id or an inclusive
    ``a-b`` range whose bounds may be given in either order. Items that are
    not positive numbers are skipped.
    """
    gids = []

    for item in spec.split(','):
        bounds = [_positive_int(part) for part in item.split('-')]

        if None in bounds:
            continue

        if len(bounds) == 1:
            # Normalised through int so '007' and '7' name the same gallery,
            # exactly as ids produced by a range already are.
            gids.append(str(bounds[0]))
        elif len(bounds) == 2:
            low, high = sorted(bounds)
            gids.extend(str(i) for i in range(low, high + 1))

    return gids


parser = argparse.ArgumentParser(description='Simple nhentai.net scraper')
parser.add_argument('-g', '--gallery', help='galleries to scrape')
parser.add_argument('-t', '--tags', help='tags to scrape')
parser.add_argument('-l', '--limit', help='max amount of albums to download')
parser.add_argument('-d', '--dir', default='hentai', help='directory to save to')
parser.add_argument('-p', '--prefix', default='', help='filename prefix')
parser.add_argument('-s', '--suffix', default='', help='filename suffix')
args = parser.parse_args()

# Named 'scraper' so the instance does not rebind the class name.
scraper = nhentai(args.dir, args.prefix, args.suffix)

# An unusable limit still means "no limit", but the user is told about it.
limit = _positive_int(args.limit)
if args.limit and limit is None:
    print('[-] Ignoring invalid limit ' + args.limit)

if args.gallery:
    if args.gallery.lower() == 'all':
        scraper.downloadGalleries(gids=[], tags=[], limit=limit)
    else:
        gids = _parse_gallery_ids(args.gallery)

        if gids:
            scraper.downloadGalleries(gids=gids, tags=[], limit=limit)
        else:
            print('[-] Error: no valid gallery ids in ' + args.gallery)
elif args.tags:
    tags = args.tags.split(',')
    scraper.downloadGalleries(gids=[], tags=tags, limit=limit)
else:
    print('[-] Error: either -g or -t must be passed')