Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import os
- import json
- import requests
- import argparse
- from bs4 import BeautifulSoup
class nhentai(object):
    """Minimal scraper for nhentai.net galleries.

    Enumerates gallery ids from listing pages, scrapes per-gallery
    metadata, and downloads each page image plus an ``info.json``
    into ``<directory>/<gallery id>/``.
    """

    base_url = 'http://nhentai.net/'
    # Optional filename decorations; sanitized copies are set in __init__.
    prefix = ''
    suffix = ''

    def __init__(self, directory='hentai', prefix='', suffix=''):
        """Configure the output directory and optional filename prefix/suffix.

        Falls back to ./hentai/ when `directory` does not exist.
        Prefix/suffix are stripped to alphanumerics plus ' ', '.' and '_'.
        """
        if directory:
            if os.path.isdir(directory):
                # Normalize to a trailing slash so later concatenation works.
                self.directory = directory if directory.endswith('/') else directory + '/'
            else:
                print('[-] ' + directory + ' does not exist')
                print('[*] Defaulting to hentai')
                # BUGFIX: the original re-tested `directory` here (always
                # missing on this branch), so os.makedirs('hentai') raised
                # FileExistsError whenever ./hentai already existed.
                if not os.path.isdir('hentai'):
                    os.makedirs('hentai')
                self.directory = 'hentai/'
        else:
            self.directory = 'hentai/'
        if prefix:
            # Keep only filesystem-friendly characters.
            self.prefix = ''.join(i for i in prefix if i.isalnum() or i in [' ', '.', '_']).rstrip()
        if suffix:
            self.suffix = ''.join(i for i in suffix if i.isalnum() or i in [' ', '.', '_']).rstrip()

    def getData(self, url):
        """Fetch `url` and return the response body as text.

        Retries on network errors. NOTE(review): a persistently failing
        URL (or an empty 200 body) loops forever -- consider a bounded
        retry count.
        """
        data = None
        while not data:
            try:
                r = requests.get(url, timeout=10)
                data = r.text
            except Exception as e:
                print('[-] Error: ' + str(e))
        return data

    def getLastPage(self, base_url=''):
        """Return the number of the last listing page under `base_url`."""
        if not base_url:
            base_url = self.base_url
        data = self.getData(base_url + '?page=1')
        soup = BeautifulSoup(data, 'html.parser')
        # The "last" pagination link's href has the form '?page=<n>'.
        return int(soup.find('a', {'class': 'last'})['href'].replace('?page=', ''))

    def getGalleryIds(self, base_url='', limit=None):
        """Collect gallery ids from the listing pages under `base_url`.

        Stops early once `limit` ids have been gathered; the result is
        truncated to exactly `limit` entries when a limit is given.
        """
        if not base_url:
            base_url = self.base_url
        ids = []
        for i in range(1, self.getLastPage(base_url) + 1):
            # `>=` stops as soon as enough ids are collected (the original
            # `>` fetched one listing page more than necessary).
            if limit and len(ids) >= limit:
                break
            # BUGFIX: the original fetched from self.base_url here, so a
            # caller-supplied base_url (e.g. a tag listing) was ignored
            # and the front page was scraped instead.
            print('[*] Getting gallery ids from ' + base_url + '?page=' + str(i), end='\r')
            data = self.getData(base_url + '?page=' + str(i))
            soup = BeautifulSoup(data, 'html.parser')
            for a in soup.findAll('a', {'class': 'cover'}):
                ids.append(a['href'].split('/')[2])
        return ids[:limit] if limit else ids

    def getGalleryInfo(self, url):
        """Scrape a gallery page into a dict with keys:
        id, url (image base), name, tags, ext, last (page count)."""
        print('[*] Getting gallery info from ' + url, end='\r')
        data = self.getData(url)
        soup = BeautifulSoup(data, 'html.parser')
        cover = soup.find('div', {'id': 'cover'})
        gallery_info = {
            'id': cover.find('a')['href'].split('/')[2],
            # Cover src looks like '//t.nhentai.net/galleries/<id>/cover.jpg';
            # strip the protocol-relative '//' and the cover filename.
            'url': 'http://' + cover.find('img')['src'].replace('cover.jpg', '').replace('cover.png', '')[2:],
            'name': soup.find('div', {'id': 'info'}).find('h1').getText().strip(),
            'tags': [],
            'ext': cover.find('img')['src'][-3:],
            # Highest page number, taken from the last thumbnail link.
            'last': int(soup.findAll('a', {'class': 'gallerythumb'})[-1]['href'].split('/')[-2])
        }
        for div in soup.findAll('div', {'class': 'field-name'}):
            if 'Tags' in div.getText():
                for a in div.findAll('a', {'class': 'tag'}):
                    # href is '/tag/<name>/'; keep just <name>.
                    gallery_info['tags'].append(a['href'][5:-1])
        return gallery_info

    def getGallery(self, url):
        """Build a download plan (name, tags, target directory, image list)
        for the gallery at `url`."""
        gallery_info = self.getGalleryInfo(url)
        gallery = {
            'name': gallery_info['name'],
            'tags': gallery_info['tags'],
            'directory': self.directory + gallery_info['id'] + '/',
            'images': []
        }
        for i in range(1, gallery_info['last'] + 1):
            filename = str(i)
            if self.prefix:
                filename = self.prefix + '_' + filename
            if self.suffix:
                filename += '_' + self.suffix
            filename += '.' + gallery_info['ext']
            # NOTE(review): '<n>t.<ext>' is the thumbnail naming scheme on
            # t.nhentai.net -- confirm thumbnails (not full pages) are the
            # intended download target.
            gallery['images'].append({
                'filename': filename,
                'url': gallery_info['url'] + str(i) + 't.' + gallery_info['ext']
            })
        return gallery

    def downloadImage(self, url, filename):
        """Stream `url` to `filename`; return True on success.

        Returns False when the file already exists or the request fails.
        BUGFIX: the original opened the file *before* checking the
        response, leaving an empty file behind on HTTP errors (which then
        blocked any retry, since existing files are skipped), and it
        returned True even on failure.
        """
        if os.path.isfile(filename):
            return False
        r = requests.get(url, stream=True)
        if not r.ok:
            print('[-] Error: something went wrong')
            return False
        with open(filename, 'wb') as f:
            for block in r.iter_content(1024):
                f.write(block)
        return True

    def downloadGallery(self, gid):
        """Download every image of gallery `gid` plus an info.json."""
        url = self.base_url + 'g/' + str(gid) + '/'
        gallery = self.getGallery(url)
        directory = gallery['directory']
        print('[*] Downloading gallery ' + gallery['name'] + ' to ' + directory)
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(directory + 'info.json', 'w') as f:
            json.dump({
                'url': url,
                'name': gallery['name'],
                'tags': gallery['tags']
            }, f)
        for image in gallery['images']:
            self.downloadImage(image['url'], directory + image['filename'])

    def downloadGalleries(self, gids=None, tags=None, limit=None):
        """Download galleries by explicit ids, by tags, or -- when neither
        is given -- everything found on the front-page listing.

        BUGFIX: mutable default arguments ([]) replaced with None; passing
        [] explicitly still behaves exactly as before.
        """
        gids = list(gids) if gids else []
        tags = list(tags) if tags else []
        if not gids and not tags:
            gids = self.getGalleryIds(limit=limit)
        elif not gids and tags:
            for tag in tags:
                gids.extend(self.getGalleryIds('http://nhentai.net/tag/' + tag + '/popular', limit=limit))
        for gid in gids:
            try:
                self.downloadGallery(gid)
            except Exception as e:
                # Best-effort: report and continue with the next gallery.
                print('[-] Error: ' + str(e))
def _parse_limit(raw):
    """Return a positive int album limit, or None for missing/invalid values."""
    if raw and raw.isnumeric() and int(raw) > 0:
        return int(raw)
    return None


def _parse_gallery_ids(spec):
    """Expand a '12,34-36' style -g spec into a list of gallery id strings.

    Ranges may be given in either order; malformed or non-positive tokens
    are silently skipped (matching the original behavior).
    """
    gids = []
    for token in spec.split(','):
        parts = token.split('-')
        if len(parts) == 1:
            if parts[0].isnumeric() and int(parts[0]) > 0:
                gids.append(parts[0])
        elif len(parts) == 2:
            if all(p.isnumeric() and int(p) > 0 for p in parts):
                lo, hi = sorted(int(p) for p in parts)
                gids.extend(str(i) for i in range(lo, hi + 1))
    return gids


parser = argparse.ArgumentParser(description='Simple nhentai.net scraper')
parser.add_argument('-g', '--gallery', help='galleries to scrape')
parser.add_argument('-t', '--tags', help='tags to scrape')
parser.add_argument('-l', '--limit', help='max amount of albums to download')
parser.add_argument('-d', '--dir', default='hentai', help='directory to save to')
parser.add_argument('-p', '--prefix', default='', help='filename prefix')
parser.add_argument('-s', '--suffix', default='', help='filename suffix')
args = parser.parse_args()

# BUGFIX: the original rebound the class name `nhentai` to the instance,
# shadowing the class for the rest of the module.
scraper = nhentai(args.dir, args.prefix, args.suffix)
limit = _parse_limit(args.limit)

if args.gallery:
    if args.gallery.lower() == 'all':
        scraper.downloadGalleries(gids=[], tags=[], limit=limit)
    else:
        gids = _parse_gallery_ids(args.gallery)
        if gids:
            scraper.downloadGalleries(gids=gids, tags=[], limit=limit)
elif args.tags:
    scraper.downloadGalleries(gids=[], tags=args.tags.split(','), limit=limit)
else:
    print('[-] Error: either -g or -t must be passed')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement