Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
'''
Scrapes a gallery from E-Hentai when provided with the URL of the gallery.

Execute with a command like the following at a command prompt:
    python ehscrape.py -i http://g.e-hentai.org/g/gallery/url/
For more information:
    python ehscrape.py -h

Requires Python 2 (the script uses urllib2 and raw_input) and the
BeautifulSoup package, which must be installed separately.
'''
import argparse
import os
import os.path
import re
import sys
import time
import urllib2
from contextlib import closing

from BeautifulSoup import BeautifulStoneSoup
# Browser-like User-Agent sent with every request (the original sent it only
# with the very first one).
_USER_AGENT = ('Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 '
               '(KHTML, like Gecko) Ubuntu/11.10 Chromium/17.0.963.79 '
               'Chrome/17.0.963.79 Safari/535.11')

# Number of images assumed per gallery index page when computing how many
# index pages to walk.
_IMAGES_PER_PAGE = 20

# Patterns are compiled once instead of once per loop iteration.
_COUNT_RE = re.compile(r'Showing\s\d\s-\s\d+\sof\s(?P<number>\d+)\simages')
_IMG_PAGE_RE = re.compile(r'http://g.e-hentai.org/s/\S{10}/\d{6}-\d+')
_IMG_STYLE_RE = re.compile(
    r'(height:\d+px|width:\d+px);(height:\d+px|width:\d+px)')
_IMG_SRC_RE = re.compile(
    r'http://\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}:?\d{1,5}?(/h|/ehg)/\S*(/keystamp)?=\S*')


def _open_url(url):
    '''Open url with the browser User-Agent; use the result in a with block.'''
    request = urllib2.Request(url, headers={'User-Agent': _USER_AGENT})
    # urllib2 responses are not context managers, so wrap them in closing()
    # to guarantee the connection is released.
    return closing(urllib2.urlopen(request))


def _fetch_soup(url):
    '''Download url and return (final_url, parsed_soup).'''
    with _open_url(url) as response:
        return response.geturl(), BeautifulStoneSoup(response.read())


def _page_count(image_count, per_page=_IMAGES_PER_PAGE):
    '''Return how many index pages are needed to list image_count images.'''
    return (image_count + per_page - 1) // per_page


def _filename_from_src(src):
    '''Derive the local file name from an image URL, or None if unknown.'''
    if 'keystamp' in src:
        name = src.split('/')[-1]
    elif 'image.php' in src:
        name = src.split('&n=')[-1]
    else:
        return None
    # The name comes from remote HTML; keep only the final path component so
    # it can never point outside the output directory.
    return os.path.basename(name) or None


def main(argv=None):
    '''Scrapes a gallery from its front page.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When None, argparse reads sys.argv as before.

    Exits via SystemExit when the arguments are invalid, when the gallery
    title or image count cannot be found on the front page, or when the user
    declines to reuse an existing output directory.
    '''
    parser = argparse.ArgumentParser(description='ehscrape parser')
    parser.add_argument('-o', '--output', action='store', default=False,
                        help='Set an output directory manually')
    # The gallery URL is mandatory: without it there is nothing to fetch.
    parser.add_argument('-i', '--input', action='store', required=True,
                        help='Provide a link to the gallery\'s page')
    parser.add_argument('-r', '--recover', action='store', default=0,
                        type=int,
                        help='Provide an image number to start from')
    parser.add_argument('-u', '--update', action='store_true', default=False,
                        help="Useful for updating galleries, ignores files "
                             "whose names are already present.")
    args = parser.parse_args(argv)

    final_url, soup = _fetch_soup(args.input + '?nw=session')
    print(final_url)

    title_tag = soup.find(name='h1', attrs={'id': 'gn'})
    title = title_tag.string if title_tag is not None else None
    if title is None:
        sys.exit('ehscrape: could not find the gallery title on the page.')
    print(title)

    count_text = soup.find(text=_COUNT_RE)
    # search() rather than match(): the text node may carry leading whitespace.
    count_match = _COUNT_RE.search(count_text) if count_text else None
    if count_match is None:
        sys.exit('ehscrape: could not find the image count on the page.')
    pageno = _page_count(int(count_match.group('number')))

    # Default output directory is the gallery title with slashes removed.
    outd = args.output if args.output else title.replace('/', '')

    # Names already on disk; stays empty for a freshly created directory so
    # that --update never hits an undefined variable.
    curfiles = set()
    if os.path.isdir(outd):
        print(u'The directory {0} already exists.'.format(outd))
        answer = raw_input(u'Continue? [y/n]')
        if answer not in ('y', 'Y', ''):
            print('ehscrape aborting.')
            sys.exit()
        curfiles = set(os.listdir(outd))
    else:
        os.mkdir(outd)

    # Walk every index page and collect the links to the per-image pages.
    img_pages = []
    for i in range(pageno):
        time.sleep(1)
        _, soup = _fetch_soup('{0}?p={1}'.format(args.input, i))
        img_pages.extend(
            a['href']
            for a in soup.findAll(name='a', attrs={'href': _IMG_PAGE_RE}))

    # --recover is a 1-based image number; clamp so 0 or negative values
    # mean "start from the beginning" rather than slicing from the end.
    start = max(args.recover - 1, 0)
    for ip in img_pages[start:]:
        time.sleep(4)
        print(ip)
        _, soup = _fetch_soup(ip)
        imgs = soup.findAll(
            name='img', attrs={'style': _IMG_STYLE_RE, 'src': _IMG_SRC_RE})
        for each in imgs:
            src = each['src']
            filename = _filename_from_src(src)
            if filename is None:
                print(u'Skipping image with unrecognised URL: {0}'.format(src))
                continue
            # Check before opening the download so skipped files cost nothing.
            if args.update and filename in curfiles:
                print(u'{0} already present'.format(filename))
                continue
            print(filename)
            # Read fully first so a failed download leaves no empty file.
            with _open_url(src) as source:
                data = source.read()
            with open(os.path.join(outd, filename), 'wb') as image:
                image.write(data)
# Start the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement