Advertisement
Guest User

Kindly E-Hentai Scraper

a guest
Jun 28th, 2012
2,442
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.40 KB | None | 0 0
  1. '''
  2. Scrapes a gallery from E-Hentai when provided with the URL of the gallery.
  3. Execute with a command like the following at a command prompt:
  4.  
  5. python ehscrape.py -i http://g.e-hentai.org/g/gallery/url/
  6.  
  7. For more information:
  8.  
  9. python ehscrape.py -h
  10.  
  11. Requires you to have BeautifulSoup installed. Google it.
  12. '''
  13.  
  14. from BeautifulSoup import BeautifulStoneSoup
  15. import time
  16. import urllib2
  17. import os
  18. import os.path
  19. import re
  20. import argparse
  21. import sys
  22.  
  23. def main():
  24.     '''Scrapes a gallery from its front page'''
  25.     parser = argparse.ArgumentParser(description='ehscrape parser')
  26.     parser.add_argument('-o', '--output', action='store', default=False,
  27.                         help='Set an output directory manually')
  28.     parser.add_argument('-i', '--input', action='store',
  29.                         help='Provide a link to the gallery\'s page')
  30.     parser.add_argument('-r', '--recover', action='store', default=0,
  31.                         help='Provide an image number to start from')
  32.     parser.add_argument('-u', '--update', action='store_true', default=False,
  33.                         help="Useful for updating galleries, ignores files \
  34.                        whose names are already present.")
  35.     args = parser.parse_args()
  36.     fpr = urllib2.Request(args.input + '?nw=session', headers = {'User-Agent' : 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/17.0.963.79 Chrome/17.0.963.79 Safari/535.11'})
  37.     fp = urllib2.urlopen(fpr)
  38.     print(fp.geturl())
  39.     with open('temp', 'w') as temp:
  40.         temp.write(fp.read())
  41.     with open('temp', 'r') as temp:
  42.         soup = BeautifulStoneSoup(temp)
  43.     os.remove('temp')
  44.     title = soup.find(name='h1', attrs={'id': 'gn'}).string
  45.     print(title)
  46.     myre = re.compile('Showing\s\d\s-\s\d+\sof\s(?P<number>\d+)\simages')
  47.     imgno = int(myre.match(soup.find(text=myre)).group('number'))
  48.     d, r = imgno / 20, imgno % 20
  49.     if r:
  50.         pageno = d + 1
  51.     else:
  52.         pageno = d
  53.     if args.output:
  54.         outd = args.output
  55.     else:
  56.         outd = u''.join(title.split('/'))
  57.     if os.path.isdir(outd):
  58.         print(u'The directory {0} already exists.'.format(outd))
  59.         r = raw_input(u'Continue? [y/n]')
  60.         if r in ['y', 'Y', '']:
  61.             curfiles = os.listdir(outd)
  62.         else:
  63.             print('ehscrape aborting.')
  64.             sys.exit()
  65.     else:
  66.         os.mkdir(outd)
  67.     img_pages = []
  68.     for i in range(pageno):
  69.         time.sleep(1)
  70.         page = urllib2.urlopen('{0}?p={1}'.format(args.input, str(i)))
  71.         dir_temp = os.path.join(outd, 'temp')
  72.         with open(dir_temp,'w') as temp:
  73.             temp.write(page.read())
  74.         with open(dir_temp,'r') as temp:
  75.             soup = BeautifulStoneSoup(temp)
  76.         os.remove(dir_temp)
  77.         are = re.compile('http://g.e-hentai.org/s/\S{10}/\d{6}-\d+')
  78.         for a in soup.findAll(name='a',attrs={'href': are}):
  79.             img_pages.append(a['href'])
  80.     if args.recover:
  81.         args.recover = int(args.recover) - 1
  82.     for ip in img_pages[int(args.recover):]:
  83.         time.sleep(4)
  84.         print(ip)
  85.         page = urllib2.urlopen(ip)
  86.         dir_temp = os.path.join(outd, 'temp')
  87.         with open(dir_temp,'w') as temp:
  88.             temp.write(page.read())
  89.         with open(dir_temp,'r') as temp:
  90.             soup = BeautifulStoneSoup(temp)
  91.         os.remove(dir_temp)
  92.         stre = re.compile(r'(height:\d+px|width:\d+px);(height:\d+px|width:\d+px)')
  93.         srre = re.compile('http://\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}:?\d{1,5}?(/h|/ehg)/\S*(/keystamp)?=\S*')
  94.         imgs = soup.findAll(name='img', attrs={'style':stre, 'src': srre})
  95.         for each in imgs:
  96.             source = urllib2.urlopen(each['src'])
  97.             if 'keystamp' in each['src']:
  98.                 filename = each['src'].split('/')[-1]
  99.             elif 'image.php' in each['src']:
  100.                 filename = each['src'].split('&n=')[-1]
  101.         if args.update:
  102.             if filename in curfiles:
  103.                 print('{0} already present'.format(filename))
  104.             else:
  105.                 print(filename)
  106.                 with open(os.path.join(outd, filename), 'wb') as image:
  107.                     image.write(source.read())
  108.         else:
  109.             print(filename)
  110.             with open(os.path.join(outd, filename), 'wb') as image:
  111.                 image.write(source.read())
  112.        
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement