Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Easy Manga Crawler for
- # NHENTAI.NET
- # Written by 0rX
- #
- # Usage:
- # run it with python
- # from terminal
- # I haven't tried it on windows
- # because I don't use windows
- # It's useful for Android users
- # with a terminal emulator such as
- # Termux, or any other
- # that already includes python3
- #
- # if nhentai.net is blocked, please use vpn
- '''
- Set maxPageThread = 10
- if you have a fast internet connection.
- Don't exceed 10, to avoid your IP being
- banned by the server for sending
- too many requests simultaneously.
- '''
- from urllib import request as req
- import threading
- import time
- import json
- import os
- import struct
- import gzip
- import io
- import re
- import sys
- # Example: https://nhentai.net/g/92405/6/ <- this is a link to a random hentai manga gallery; its gallery number is "92405", so enter "92405" at the prompt to download exactly the manga from this link
# --- Settings ---------------------------------------------------------------
dl = True
maxPageThread = 3   # batch-size limit checked by progloop()
cooldown = 0.5      # seconds handed to time.sleep() in progloop()

# --- State describing the gallery currently being processed -----------------
# galleryFind() fills in destination, total_page, series and gallery.
session_name, destination, series, gallery = "", "", "", ""
total_page = 0

# --- Download bookkeeping (page numbers) ------------------------------------
# Three separate list objects: queued, already attempted, and failed pages.
dljob, dldone, dlfail = [], [], []
def _fetch_gallery_html(url):
    """Download *url* and return the response body as text.

    The request advertises gzip support, so a gzip-compressed body is
    decompressed before it is decoded. The Content-Encoding header value
    is printed, as the script has always done.

    Raises:
        ValueError: if the server uses a Content-Encoding other than
            gzip or identity.
        urllib.error.URLError: if the request itself fails.
    """
    reque = req.Request(url, headers={'Accept-encoding':'gzip','User-Agent': 'Mozilla/5.0'})
    # The context manager closes the connection once the body is read.
    with req.urlopen(reque) as response:
        encoding = response.headers.get('Content-Encoding')
        body = response.read()
    print(encoding)
    scheme = (encoding or "identity").strip().lower()
    if scheme == "gzip":
        body = gzip.decompress(body)
    elif scheme != "identity":
        raise ValueError("Unsupported Content-Encoding: %s" % encoding)
    # Always return str: the parser splits on str markers, which raises
    # TypeError on raw bytes (the uncompressed path used to hit that).
    return body.decode('utf-8', errors='replace')


def _parse_gallery_html(html):
    """Extract ``(title, page_count)`` from the HTML of a gallery page.

    The extraction is plain string splitting on fixed markers, kept
    exactly as the script has always done it.

    Raises:
        IndexError: if an expected marker is missing from *html*.
        ValueError: if the text found for the page count is not an integer.
    """
    # Page count: split on "<div>" and "Pages:", then split the second
    # piece on the "name" span and append the results to the same list;
    # the integer is read from the item that ends up at index 3.
    pieces = []
    for chunk in html.split("<div>"):
        pieces.extend(chunk.split("Pages:"))
    pieces.extend(pieces[1].split("<span class=\"name\">"))
    page_count = int(pieces[3].split("</span>")[0])

    # Title: the text before the first "</h1>" and before the first
    # caption div holds the "before" / "pretty" / "after" spans, which are
    # concatenated in that order.
    head = html.split("</h1>")[0].split('<div class="caption">')[0]
    spans = head.split("<span class=\"before\">")[1]
    spans = spans.split("</span><span class=\"pretty\">")
    tail = spans[1].split("</span><span class=\"after\">")
    title = spans[0] + tail[0] + tail[1].split("<")[0]
    return title, page_count


def _sanitize_title(title):
    """Return *title* with ', /, | and " removed or replaced, as the folder name needs."""
    cleaned = title.replace("'", "")
    cleaned = cleaned.replace("/", "-")
    cleaned = cleaned.replace("|", "+")
    return cleaned.replace("\"", "")


def galleryFind():
    """Ask for a gallery number, read its title and page count, and prepare the folder.

    Prompts on stdin, downloads ``http://nhentai.net/g/<number>``, and
    then updates the module globals ``gallery``, ``total_page``,
    ``series`` and ``destination`` before calling ``createDir`` on the
    destination folder (``pig/<title> (<number>)/``).

    Raises:
        ValueError, IndexError: if the page cannot be decoded or parsed.
        urllib.error.URLError: if the page cannot be downloaded.
    """
    global destination
    global total_page
    global series
    global gallery
    gallery = str(input("Input manga gallery number : ")).strip()

    html = _fetch_gallery_html("http://nhentai.net/g/%s" % (gallery))
    print("total item: "+str(len(html.split("</h1>"))))

    raw_title, total_page = _parse_gallery_html(html)
    print("Total page :"+str(total_page))

    # The old guard (`"'" or "/" or "|" in title`) was always true; the
    # replacements are harmless no-ops when nothing matches, so sanitize
    # unconditionally and only echo the result when it actually changed.
    cleaned = _sanitize_title(raw_title)
    if cleaned != raw_title:
        print(cleaned)
    series = cleaned
    print(series)

    destination = "pig/%s (%s)/"%(series,gallery)
    createDir(destination)
urls = {}


def createDir(destiny):
    """Create the directory *destiny*, including any missing parent folders.

    The download folder is a nested path (``pig/<title> (<id>)/``), so a
    plain ``os.mkdir`` fails whenever the parent folder does not exist yet.
    An already existing directory is not an error.

    Args:
        destiny: path of the directory to create.

    Any OSError (permissions, invalid name, ...) is printed rather than
    raised, keeping the original best-effort behaviour.
    """
    try:
        os.makedirs(destiny, exist_ok=True)
    except OSError as e:
        print(e)
def downloop(page):
    """Download one page image of the current gallery into ``destination``.

    Reads the module globals ``destination`` and ``gallery`` and updates
    ``dlfail``. The image is stored as ``<destination><NNN>.jpg`` where
    NNN is *page* zero-padded to three digits. A page whose file already
    exists is skipped.

    Args:
        page: 1-based page number.

    Returns:
        None. Failures are not raised: the error is printed, *page* is
        added to ``dlfail`` and a leftover image file is deleted. After a
        successful download *page* is removed from ``dlfail`` again.
    """
    global dlfail
    # Built before the try block so the failure handler can always use
    # them, even when the very first filesystem call raises.
    pagenum = str(page).zfill(3)
    filename = pagenum + ".jpg"
    filepath = "%s%s" % (destination, filename)
    try:
        if filename in os.listdir(destination):
            print(pagenum+".jpg Checked.")
            return

        pagelink = "http://nhentai.net/g/%s/%s"% (gallery, page)
        reque = req.Request(pagelink, headers={'Accept-encoding':'gzip','User-Agent': 'Mozilla/5.0'})
        # The context manager closes the connection once the body is read.
        with req.urlopen(reque) as response:
            encoding = response.headers.get('Content-Encoding')
            payload = response.read()
        if encoding is not None:
            if encoding.upper() != "GZIP":
                raise ValueError("Unsupported Content-Encoding: %s" % encoding)
            payload = gzip.decompress(payload)

        # NOTE: str() of a bytes object yields its repr ("b'...'"), in
        # which tabs/newlines appear as backslash escapes. The splitting
        # below depends on that form ("\t<img " becomes "t<img " after
        # splitting on the backslash), so do not decode the payload here.
        rawl = str(payload)
        rawl_list = []
        for li in rawl.split("\\"):
            rawl_list.extend(li.split("t<img "))
        last_list = [i for i in rawl_list
                     if 'src="https://i.nhentai.net/galleries/' in i]
        last = last_list[0].split('<img src=\"')
        linkx = last[2].split("\" ")[0]

        req.urlretrieve(linkx, filepath)
        print(pagenum+".jpg is done")
        if page in dlfail:
            dlfail.remove(page)
            print("page "+str(page)+" removed from dlfail.")
        return
    except Exception as ex:
        # Top-level boundary of a worker thread: report and record the
        # failure so the page can be attempted again.
        print(ex)
        print("Page no "+str(page)+" failed.")
        if page not in dlfail:
            dlfail.append(page)
        # Remove a partially written image; this must never raise itself.
        if os.path.isfile(filepath):
            try:
                os.remove(filepath)
                print("failed page removed.")
            except OSError as rm_exc:
                print(rm_exc)
- '''
- createDir()
- pagerange = total_page+1
- dljob = []
- dldone=[]
- '''
def progloop():
    """Download every page of the current gallery in batches of threads.

    Reads the module globals ``total_page``, ``maxPageThread``,
    ``cooldown`` and ``destination`` and works on the shared lists
    ``dljob``, ``dldone`` and ``dlfail``. Each round queues at most
    ``maxPageThread`` pages that were not attempted yet or are listed in
    ``dlfail``, runs ``downloop`` for each of them in its own thread and
    waits for the whole batch. The loop ends once ``destination`` holds at
    least ``total_page`` entries.

    NOTE: a page that fails every time is retried indefinitely.
    """
    global dljob
    global dldone
    global dlfail
    # Never let a zero/negative setting produce empty batches forever.
    limit = max(1, maxPageThread)
    while True:
        pagerange = total_page+1
        print(dlfail)

        for i in range(1, pagerange):
            # Strictly cap the batch: the former `<=` test queued one page
            # more than maxPageThread.
            if len(dljob) >= limit:
                break
            if i not in dldone or i in dlfail:
                dljob.append(i)

        threads = [threading.Thread(target=downloop, args=(page,)) for page in dljob]
        for thread in threads:
            thread.start()
            # Stagger the requests instead of firing them all at once.
            time.sleep(cooldown)
        for thread in threads:
            thread.join()

        # Record attempted pages once, even when a failed page is retried.
        dldone.extend(page for page in dljob if page not in dldone)
        dljob.clear()

        if len(os.listdir(destination)) >= (pagerange-1):
            dldone.clear()
            print("done.")
            break
# Main loop: keep asking for galleries and downloading them until the
# process is interrupted.
while True:
    galleryFind()
    progloop()
    # downloop() only ever assigns a *local* session_name, so the
    # module-level one stays empty; build the label from the values
    # galleryFind() stored so the message names the finished gallery.
    session_name = "%s (%s)" % (series, gallery)
    print(session_name + " Done.")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement