imcumming

a guest, Apr 1st, 2015
#!/usr/bin/env python3

import imghdr
import os
import re
import sys
import time

import requests
from bs4 import BeautifulSoup

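# Note: requests and BeautifulSoup are third-party packages
# (pip install requests beautifulsoup4). imghdr is standard library
# here, but it was deprecated in Python 3.11 and removed in 3.13.
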
class BadStatus(Exception):
    '''Raised when an HTTP response status is not 200. Takes the status code as required input.'''
    def __init__(self, status):
        self.status = status

    def __str__(self):
        return repr(self.status)


def GetGallery(gallery_url): #Takes url string
    '''Fetches a page, retrying up to three times on connection errors.'''
    fail = 0
    while fail < 3:
        try:
            r = s.get(gallery_url, cookies=cookies, headers=headers)
            break
        except requests.exceptions.RequestException:
            fail += 1
            print("Failed to load page (attempt %d / 3). Retrying in 10s..." % fail)
            time.sleep(10)
    if fail == 3:
        sys.exit("Fatal error: Exceeded max retry attempts for page load")

    return r

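# ExHentai URL shapes: /s/ is a single image page, /g/ is a gallery
# front page, and anything else is treated as a search/overview listing.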
def PageInfo(gallery_url): #Takes url string
    '''Classifies a URL as a single image page, a gallery, or an overview listing.'''
    if re.match(r'.*/s/.*', gallery_url):
        return "Page"
    elif re.match(r'.*/g/.*', gallery_url):
        return "Gallery"
    else:
        return "Overview"


def DownloadImage(image_url, image_name): #Takes strings
    '''Downloads one image. Takes the image URL and the file name to save it under.'''
    r = s.get(image_url, cookies=cookies, headers=headers, stream=True)
    fulldir = ovdirectory + directory
    if r.status_code == 200:
        with open(os.path.join(fulldir, image_name), 'wb') as f:
            for chunk in r.iter_content(1024):
                f.write(chunk)
    else:
        raise BadStatus(r.status_code)

    # Correct the extension if it does not match the detected image type.
    # imghdr reports 'jpeg', so normalise that to the usual 'jpg' suffix.
    path = os.path.join(fulldir, image_name)
    detected = imghdr.what(path)
    if detected == 'jpeg':
        detected = 'jpg'
    if detected is None:
        print("Warning: could not determine image type for %s" % image_name)
    elif image_name[-3:] != detected:
        os.rename(path, path[:-3] + detected)

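# Each image page can offer two sources: a "fullimg.php" link to the
# original full-size file on the ExH servers (when available) and the
# displayed image served from the Hentai@Home (H@H) network.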
def GetImageURL(gallery): #Takes BeautifulSoup object
    '''Returns the URL of the full-size original if a fullimg.php link exists, otherwise the resampled H@H image.'''
    fullimg = gallery.findAll('a', href=re.compile('^http://exhentai.org/fullimg.php'))
    if fullimg:
        print("High res - using ExH servers")
        return fullimg[-1].get('href')
    print("Not high res - using H@H")
    return gallery.find(id="img")['src']


def SetDirectory(gallery): #Takes BeautifulSoup object
    '''Sets the download directory to the gallery title, creating it if needed.'''
    global directory
    directory = gallery.find('title').string
    if not os.path.isdir(ovdirectory + directory):
        os.makedirs(ovdirectory + directory)

def PageLoop(gallery): #Takes BeautifulSoup object
    '''Walks every image page of a gallery, downloading each image and retrying failures once at the end.'''
    imagenum = 0
    SetDirectory(gallery)

    while gallery:
        name = FindName(gallery)
        print(name)
        imgurl = GetImageURL(gallery)
        print(imgurl)

        try:
            DownloadImage(imgurl, name)
        except (requests.exceptions.RequestException, BadStatus):
            print("Error: Could not download image. Will try again later.")
            failnames.append(name)
            failurls.append(imgurl)

        gallery = NextGallery(gallery)
        imagenum += 1

        if imagenum == stop:
            gallery = False

    # Second pass over anything that failed the first time.
    for i in range(len(failnames)):
        try:
            DownloadImage(failurls[i], failnames[i])
        except (requests.exceptions.RequestException, BadStatus):
            print("Error: Retry failed. Giving up on this image.")
            death.append(failnames[i] + " - " + failurls[i])

    # Reset the retry lists so the next gallery starts clean.
    del failnames[:]
    del failurls[:]
    if death:
        print(death)

def NextGallery(page): #Takes a BeautifulSoup object
    '''Follows the "next" link. Returns the next page's soup, or False when the link loops back to the current page (last image).'''
    new = page.find(id="next")['href']
    newpage = BeautifulSoup(GetGallery(new).text, 'html.parser')
    if newpage != page:
        return newpage
    else:
        return False


def FindName(imagepage): #Takes BeautifulSoup object
    '''Extracts the image file name from the info block on an image page.'''
    imgname = imagepage.findAll('div', {'id': 'i4'})[0].find('div').string
    imgname = re.sub(r' :: .*', '', imgname)
    return imgname


def NextOverview(page): #Takes a BeautifulSoup object
    '''Follows the ">" pagination link on an overview page. Returns the next page's soup, or False on the last page.'''
    npitem = page.findAll('a', text=re.compile('>'))
    try:
        newpageurl = npitem[0].get('href')
        newpage = BeautifulSoup(GetGallery(newpageurl).text, 'html.parser')
        return newpage
    except IndexError:
        return False

def GetGalleryURLs(overview):
    '''Collects the gallery URLs from every page of an overview listing.'''
    urls = []
    while overview:
        galinfos = overview.findAll('div', {'class': 'it5'})
        for i in galinfos:
            a = i.find('a')
            urls.append(a['href'])
        overview = NextOverview(overview)

    print(urls)
    return urls

def DownloadGallery(gallery):
    '''Opens the first image page of a gallery and starts the page loop.'''
    first = gallery.find('a', href=re.compile('^http://exhentai.org/s/'))
    furl = first.get('href')
    PageLoop(BeautifulSoup(GetGallery(furl).text, 'html.parser'))


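# Main script: parse the command line, set up the HTTP session and
# authentication cookies, then dispatch on the URL type.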
failnames = []
failurls = []
death = []
directory = ''
ovdirectory = ''

try:
    gallery_url = sys.argv[1]
except IndexError:
    sys.exit("Supply me a URL, come on")

try:
    # Optional image limit; must be an int to match the counter in PageLoop.
    stop = int(sys.argv[2])
except IndexError:
    stop = 999999

try:
    ovdirectory = sys.argv[3]
except IndexError:
    ovdirectory = ""
s = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'}
cookies = dict(ipb_member_id='PUT YOUR THING HERE', ipb_pass_hash='PUT YOUR THING HERE', yay='louder')

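# The ipb_member_id and ipb_pass_hash values come from your own
# logged-in browser session; the placeholders above must be filled in
# before the site will serve any pages.
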
PageID = PageInfo(gallery_url)
m_gallery = BeautifulSoup(GetGallery(gallery_url).text, 'html.parser')

if PageID == "Overview":
    if ovdirectory == "":
        ovdirectory = "DownloadRun/"
    else:
        ovdirectory += "/DownloadRun/"
    gal_urls = GetGalleryURLs(m_gallery)
    for i in gal_urls:
        DownloadGallery(BeautifulSoup(GetGallery(i).text, 'html.parser'))
elif PageID == "Gallery":
    DownloadGallery(m_gallery)
elif PageID == "Page":
    PageLoop(m_gallery)
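
# Example usage (script name and URLs are illustrative placeholders):
#   python3 exripper.py "http://exhentai.org/g/123456/abcdef1234/"               # rip a whole gallery
#   python3 exripper.py "http://exhentai.org/s/abcdef1234/123456-1"              # rip from a single page onward
#   python3 exripper.py "http://exhentai.org/g/123456/abcdef1234/" 5             # stop after 5 images
#   python3 exripper.py "http://exhentai.org/g/123456/abcdef1234/" 999999 rips/  # save under rips/<gallery title>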