# Easy_Nhentai_crawler_fix

# Easy Manga Crawler for
# NHENTAI.NET
# Written by 0rX
#
# Usage:
#    Run it with Python 3 from a terminal.
#    It has not been tried on Windows
#    (the author does not use Windows).
#    It is useful to Android users
#    with a terminal emulator such as
#    Termux, or any other emulator
#    that already includes python3.
#
#    if nhentai.net is blocked, please use vpn

'''
change maxPageThread = 10
if you have a fast internet connection
don't exceed 10 to avoid your ip being
banned by the server due to your ip
requesting too many times simultaneously

'''


from urllib import request as req
import threading
import time
import json
import os
import struct
import gzip
import io
import re
import sys

#https://nhentai.net/g/92405/6/  <- example link to a gallery page; to download this manga, enter the gallery number "92405" when the script prompts for it

# --- Tunables -------------------------------------------------------------
# Upper bound used by progloop() when queueing pages for one round.
maxPageThread = 3
# Seconds slept by progloop() between starting two download threads.
cooldown = 0.5

# --- Flags / labels -------------------------------------------------------
dl = True
session_name = ""

# --- State of the gallery being processed (filled in by galleryFind) ------
gallery = ""       # gallery number typed by the user
series = ""        # sanitized gallery title
destination = ""   # output folder for the gallery's images
total_page = 0     # page count parsed from the gallery page

# --- Work lists shared between progloop() and downloop() ------------------
# Three independent lists: pages queued for the current round, pages
# already attempted, and pages whose last attempt failed.
dljob, dldone, dlfail = [], [], []

def galleryFind():
    """Ask for a gallery number, scrape its title and page count, create its folder.

    Side effects:
        Sets the module globals ``gallery``, ``total_page``, ``series`` and
        ``destination`` and calls ``createDir(destination)``.

    Raises:
        IndexError / ValueError: when the downloaded page does not contain
            the markers the scraper splits on (the parsing is purely
            positional).
        Any error raised by ``urllib.request.urlopen`` is propagated.
    """
    global destination
    global total_page
    global series
    global gallery

    # The number ends up in a URL and in a directory name, so only plain
    # digits are accepted; anything else is asked for again.
    gallery = input("Input manga gallery number :  ").strip()
    while not re.fullmatch(r"[0-9]+", gallery):
        print("Gallery number must contain digits only.")
        gallery = input("Input manga gallery number :  ").strip()

    # getting title and total page
    pagelink = "http://nhentai.net/g/%s" % (gallery)
    reque = req.Request(pagelink, headers={'Accept-encoding': 'gzip', 'User-Agent': 'Mozilla/5.0'})
    with req.urlopen(reque) as response:
        encoding = response.headers.get('Content-Encoding')
        payload = response.read()
    print(encoding)
    if encoding is not None and encoding.upper() == "GZIP":
        payload = gzip.decompress(payload)
    # Always work on text: an uncompressed reply used to stay as bytes and
    # broke every str-based split below.
    rawl = payload.decode('utf-8', errors='replace')

    rawl2 = rawl.split("</h1>")
    print("total item: " + str(len(rawl2)))

    # Page count: flatten the document on "<div>" and "Pages:", append the
    # '<span class="name">' pieces of the second fragment, then read the
    # number that starts the fourth fragment.
    rawl3_list = []
    for pg in rawl.split("<div>"):
        rawl3_list.extend(pg.split("Pages:"))
    rawl3_list.extend(rawl3_list[1].split("<span class=\"name\">"))
    total_page = int(rawl3_list[3].split("</span>")[0])
    print("Total page :" + str(total_page))

    # Title: everything before the first "</h1>" (and before the first
    # caption div) holds the before / pretty / after title spans.
    title_list = rawl2[0].split('<div class="caption">')[0]
    artist = title_list.split("<span class=\"before\">")[1]
    artist = artist.split("</span><span class=\"pretty\">")
    afterTitle = artist[1].split("</span><span class=\"after\">")
    titleDone = artist[0] + afterTitle[0] + afterTitle[1].split("<")[0]

    # Make the title usable as a folder name. These replacements were
    # already applied on every run (the old guard was always true).
    for old, new in (("&#x27;", ""), ("/", "-"), ("|", "+"), ("\"", "")):
        titleDone = titleDone.replace(old, new)
    print(titleDone)
    series = titleDone
    print(series)

    destination = "pig/%s (%s)/" % (series, gallery)
    createDir(destination)

def createDir(destiny):
    """Create the directory *destiny*, including any missing parent folders.

    An already existing directory is not an error. Other OS-level failures
    (permissions, invalid path, ...) are printed rather than raised, so the
    call stays best-effort.

    Args:
        destiny: path of the directory to create.
    """
    try:
        # makedirs (not mkdir) so a nested path such as "pig/<title>/" works
        # even when the parent folder does not exist yet.
        os.makedirs(destiny, exist_ok=True)
    except OSError as e:
        print(e)


def downloop(page):
    """Download the image of one gallery page into ``destination``.

    The image is stored as ``<destination><NNN>.jpg`` where ``NNN`` is the
    page number zero-padded to three digits. A page whose file is already
    present is skipped. Errors are caught and printed: the page is recorded
    in ``dlfail`` and any partially written file is deleted so that it is
    not mistaken for a finished download on the next round.

    Args:
        page: 1-based page number inside the current gallery.
    """
    global dlfail
    # Built before the try block so the failure handler can always name the
    # file, even when the very first statement inside the try fails.
    pagenum = str(page).zfill(3)
    filename = pagenum + ".jpg"
    target = "%s%s" % (destination, filename)
    try:
        if filename in os.listdir(destination):
            print(pagenum + ".jpg Checked.")
            # The file is on disk, so the page must not stay queued as failed.
            if page in dlfail:
                dlfail.remove(page)
                print("page " + str(page) + " removed from dlfail.")
            return

        pagelink = "http://nhentai.net/g/%s/%s" % (gallery, page)
        reque = req.Request(pagelink, headers={'Accept-encoding': 'gzip', 'User-Agent': 'Mozilla/5.0'})
        with req.urlopen(reque) as response:
            encoding = response.headers.get('Content-Encoding')
            payload = response.read()
        if encoding is not None and encoding.upper() == "GZIP":
            payload = gzip.decompress(payload)

        # NOTE: the scraping below deliberately runs on the *repr* of the
        # bytes (str(b"...")): the escaped "\t" / "\n" sequences of that
        # repr are what the backslash and "t<img " splits rely on.
        rawl = str(payload)
        rawl_list = []
        for li in rawl.split("\\"):
            rawl_list.extend(li.split("t<img "))
        last_list = [i for i in rawl_list if 'src="https://i.nhentai.net/galleries/' in i]
        # Third '<img src="' piece of the first matching fragment, cut at
        # the closing quote, is the image URL.
        linkx = last_list[0].split('<img src=\"')[2].split("\" ")[0]

        req.urlretrieve(linkx, target)
        print(pagenum + ".jpg is done")
        if page in dlfail:
            dlfail.remove(page)
            print("page " + str(page) + " removed from dlfail.")
        return
    except Exception as ex:
        print(ex)
        print("Page no " + str(page) + " failed.")
        if page not in dlfail:
            dlfail.append(page)
        # Remove a partial file on every failure, not only on the first one.
        if os.path.exists(target):
            try:
                os.remove(target)
                print("failed page removed.")
            except OSError as rm_err:
                print(rm_err)
'''
createDir()
pagerange = total_page+1
dljob = []
dldone=[]
'''

def progloop():
    """Download every page of the current gallery, a few pages per round.

    Each round queues at most ``maxPageThread`` pages that have not been
    attempted yet or are listed in ``dlfail``, runs one ``downloop`` thread
    per queued page (starting them ``cooldown`` seconds apart), waits for
    all of them, and then checks whether ``destination`` holds at least
    ``total_page`` entries. The function returns once that is the case.
    """
    global dljob
    global dldone
    global dlfail
    while True:
        pagerange = total_page + 1
        print(dlfail)

        # Fill the queue; stop at the limit (the former "<=" comparison
        # let one job too many through).
        for i in range(1, pagerange):
            if len(dljob) >= maxPageThread:
                break
            if i not in dldone or i in dlfail:
                dljob.append(i)

        threads = [threading.Thread(target=downloop, args=(page,)) for page in dljob]
        for thread in threads:
            thread.start()
            time.sleep(cooldown)
        for thread in threads:
            thread.join()

        # Only after every thread has finished: mark the round's pages as
        # attempted and empty the queue for the next round.
        for job in dljob:
            if job not in dldone:
                dldone.append(job)
        dljob.clear()

        if len(os.listdir(destination)) >= (pagerange - 1):
            dldone.clear()
            # Start the next gallery without stale failure entries.
            dlfail.clear()
            print("done.")
            break

# Main loop: keep asking for gallery numbers and downloading them until the
# user interrupts the script (Ctrl-C, or end-of-input at the prompt).
try:
    while True:
        galleryFind()
        progloop()
        # Report each finished gallery; the label is rebuilt here because
        # nothing else updates the module-level session_name.
        session_name = "%s (%s)" % (series, gallery)
        print(session_name + " Done.")
except (KeyboardInterrupt, EOFError):
    print("\nStopped.")