#!/usr/bin/env python3
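'''Batch image downloader for exhentai.org.

Accepts a page (/s/), gallery (/g/), or overview URL on the command line
and saves every reachable image to disk, retrying failed downloads once.
See the usage notes at the bottom of the file.
'''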
import os
import requests
import time
import sys
import re
import imghdr
from bs4 import BeautifulSoup
class BadStatus(Exception):
    '''Exception for when the HTTP status is not 200. Takes the status as required input.'''
    def __init__(self, status):
        self.status = status

    def __str__(self):
        return repr(self.status)
def GetGallery(gallery_url):  # Takes a URL string
    '''Fetches a page, retrying up to three times before giving up.'''
    fail = 0
    while fail < 3:
        try:
            r = s.get(gallery_url, cookies=cookies, headers=headers)
            break
        except requests.exceptions.RequestException:
            fail += 1
            print("Failed to load page (attempt %d / 3). Retrying in 10s..." % fail)
            time.sleep(10)
    if fail == 3:
        sys.exit("Fatal error: Exceeded max retry attempts for page load")
    return r
def PageInfo(gallery_url):  # Takes a URL string
    '''Classifies a URL as a single image page, a gallery, or an overview listing.'''
    if re.match(r'.*/s/.*', gallery_url):
        return "Page"
    elif re.match(r'.*/g/.*', gallery_url):
        return "Gallery"
    else:
        return "Overview"
def DownloadImage(image_url, image_name):  # Takes two strings
    '''Downloads a single image into the current target directory, then fixes
    its file extension to match the actual image type.'''
    r = s.get(image_url, cookies=cookies, headers=headers, stream=True)
    if r.status_code != 200:
        raise BadStatus(r.status_code)
    fulldir = ovdirectory + directory
    filepath = os.path.join(fulldir, image_name)
    with open(filepath, 'wb') as f:
        for chunk in r.iter_content(1024):
            f.write(chunk)
    try:
        # imghdr reports types like 'jpeg' and 'png'; rename if the extension disagrees.
        realtype = imghdr.what(filepath)
        if image_name[-3:] != realtype:
            os.rename(filepath, os.path.join(fulldir, image_name[:-3] + realtype))
    except TypeError:
        print("Warning: could not determine image type for %s" % image_name)
def GetImageURL(gallery):  # Takes a BeautifulSoup object
    '''Returns the original full-size image URL when one is offered,
    otherwise falls back to the resampled H@H image.'''
    fullimg = gallery.findAll('a', href=re.compile('^http://exhentai.org/fullimg.php'))
    if fullimg:
        print("High res - using ExH servers")
        return fullimg[0].get('href')
    print("Not high res - using H@H")
    return gallery.find(id="img")['src']
def SetDirectory(gallery):  # Takes a BeautifulSoup object
    '''Names the output directory after the gallery title and creates it.'''
    global directory
    directory = gallery.find('title').string
    if not os.path.isdir(ovdirectory + directory):
        os.makedirs(ovdirectory + directory)
def PageLoop(gallery):  # Takes a BeautifulSoup object
    '''Walks through the image pages of a gallery, downloading as it goes,
    then retries anything that failed on the first pass.'''
    imagenum = 0
    SetDirectory(gallery)
    while gallery:
        name = FindName(gallery)
        print(name)
        imgurl = GetImageURL(gallery)
        print(imgurl)
        try:
            DownloadImage(imgurl, name)
        except (requests.exceptions.RequestException, BadStatus):
            print("Error: Could not download image. Will try again later.")
            failnames.append(name)
            failurls.append(imgurl)
        gallery = NextGallery(gallery)
        imagenum += 1
        if imagenum == stop:
            gallery = False
    # Second pass over the failures; anything that fails again is abandoned.
    for name, url in zip(failnames, failurls):
        try:
            DownloadImage(url, name)
        except (requests.exceptions.RequestException, BadStatus):
            print("Error: Retry failed; giving up on this image.")
            death.append(name + " - " + url)
    print(death)
def NextGallery(page):  # Takes a BeautifulSoup object
    '''Follows the "next" link and returns the parsed next page, or False when
    the link leads back to an identical page (i.e. this was the last image).'''
    new = page.find(id="next")['href']
    newpage = BeautifulSoup(GetGallery(new).text, 'html.parser')
    if newpage != page:
        return newpage
    else:
        return False
def FindName(imagepage):  # Takes a BeautifulSoup object
    '''Extracts the image filename from the info pane, dropping the " :: " suffix.'''
    imgname = imagepage.find('div', {'id': 'i4'}).find('div').string
    imgname = re.sub(r' :: .*', '', imgname)
    return imgname
def NextOverview(page):  # Takes a BeautifulSoup object
    '''Follows the ">" pagination link on an overview page; False when absent.'''
    npitem = page.findAll('a', text=re.compile('>'))
    try:
        newpageurl = npitem[0].get('href')
        newpage = BeautifulSoup(GetGallery(newpageurl).text, 'html.parser')
        return newpage
    except IndexError:
        return False
def GetGalleryURLs(overview):  # Takes a BeautifulSoup object
    '''Collects gallery URLs from every page of an overview listing.'''
    urls = []
    while overview:
        galinfos = overview.findAll('div', {'class': 'it5'})
        for i in galinfos:
            a = i.find('a')
            urls.append(a['href'])
        overview = NextOverview(overview)
    print(urls)
    return urls
def DownloadGallery(gallery):  # Takes a BeautifulSoup object
    '''Finds the first image page of a gallery and starts the download loop.'''
    first = gallery.find('a', href=re.compile('^http://exhentai.org/s/'))
    furl = first.get('href')
    PageLoop(BeautifulSoup(GetGallery(furl).text, 'html.parser'))
failnames = []
failurls = []
death = []
directory = ''
ovdirectory = ''

try:
    gallery_url = sys.argv[1]
except IndexError:
    sys.exit("Supply me a URL, come on")

try:
    stop = int(sys.argv[2])
except IndexError:
    stop = 999999

try:
    ovdirectory = sys.argv[3]
except IndexError:
    ovdirectory = ""

s = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'}
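# These are the standard IPB forum login cookies; copy the ipb_member_id and
# ipb_pass_hash values from a logged-in browser session (the placeholders
# below are from the original paste and must be filled in).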
cookies = dict(ipb_member_id='PUT YOUR THING HERE', ipb_pass_hash='PUT YOUR THING HERE', yay='louder')
PageID = PageInfo(gallery_url)
m_gallery = BeautifulSoup(GetGallery(gallery_url).text, 'html.parser')

if PageID == "Overview":
    if ovdirectory == "":
        ovdirectory = "DownloadRun/"
    else:
        ovdirectory += "/DownloadRun/"
    gal_urls = GetGalleryURLs(m_gallery)
    for i in gal_urls:
        DownloadGallery(BeautifulSoup(GetGallery(i).text, 'html.parser'))
elif PageID == "Gallery":
    DownloadGallery(m_gallery)
elif PageID == "Page":
    PageLoop(m_gallery)
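# Usage sketch, per the argument parsing above (the filename "exdownload.py"
# is illustrative, not from the original paste):
#   python3 exdownload.py <exhentai URL>                  # download everything
#   python3 exdownload.py <exhentai URL> 5                # stop after 5 images per gallery
#   python3 exdownload.py <exhentai URL> 5 /path/to/dir   # also set the base output directory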