Guest User

e6collector.py version 2.1

a guest
Mar 8th, 2020
622
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.48 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3.  
  4. """
  5. e6collector
  6.  
  7. Very simple Python 3.6 script that downloads all images with a specific
  8. tag, saves them named ID-SOME-TAGS.EXTENSION and makes sure it doesn't
  9. download the same image multiple times because the tags changed. Also,
  10. all tags are written to tags.csv in the same folder. It uses the
  11. e621 2020 API described on https://e621.net/help/api
  12.  
  13. PROTIP: To view images with a certain tag try this on UNIX-like systems:
  14. gthumb `for f in $(grep -i TAG tags.csv | cut -f 1 -d ","); do echo $f-*; done`
  15.  
  16. THE AUTHOR DOES NOT TAKE ANY RESPONSIBILITIES, THERE IS NO WARRANTY AND
  17. YOU PROBABLY SHOULDN'T USE THIS IN A NUCLEAR POWER PLANT, JUST SAYING!
  18.  
  19. License: Public domain, do whatever.
  20.  
  21. Version 1.0 -- Initial release, if you can call it that
  22. Version 1.0.1 -- Fixed Unicode problem on Windows
  23. Version 1.0.2 -- Fixed API not working
  24. Version 2.0 -- Ported to new 2020 API, added rate limiting
  25. Version 2.1 -- Handle restricted images by allowing API key usage or skipping
  26. """
  27. import argparse
  28. import base64
  29. import csv
  30. import glob
  31. import json
  32. import time
  33. import os.path
  34. from collections import namedtuple
  35. from urllib.request import urlopen, Request
  36.  
  37. BASEURL = 'https://e621.net'
  38. LISTURL = BASEURL + '/posts.json?limit=320&page={0:d}&tags={1:s}'
  39. FNAME = '{0:d}-{1:s}.{2:s}'
  40. KEEPCHARS = ['_', '-']  # For tag mangling
  41. destination = None
  42.  
  43. lastrequest = 0 # UNIX time stamp of last request
  44. hdr = {'User-Agent': 'e6collector/2.1 (by stealthmode)'}
  45. Post = namedtuple("Post", ['id', 'deleted', 'tags', 'ext', 'url',
  46.                            'sources'])
  47.  
  48. # Simple delay for adhering to 1 per second rate limit outlined in API doc
  49. def speedlimit():
  50.     global lastrequest
  51.     while lastrequest + 1 > time.time():
  52.         print('Zzzz')
  53.         time.sleep(0.2)
  54.     lastrequest = time.time()
  55.  
  56. # Add HTTP basic auth headers manually. does not work via urllib handler
  57. # because the server never asks since it's optional
  58. def login(user, key):
  59.     global hdr
  60.     if user is not None and key is not None:
  61.         print('Using API key')
  62.         credentials = '{0:s}:{1:s}'.format(user, key)
  63.         b64credentials = base64.b64encode(str.encode(credentials)).decode()
  64.         hdr['Authorization'] = 'Basic {0:s}'.format(b64credentials)
  65.  
  66. def readlist(query):
  67.     page = 1 # Starting to count from 1, how strange
  68.     posts = []
  69.    
  70.     while True:  # Ick
  71.         # Some error handling would be nice, this will bug out if e6 is slow.
  72.         print('Now fetching page {}'.format(page))
  73.         with urlopen(Request(LISTURL.format(page, query), headers=hdr)) as req:
  74.             jsondata = json.load(req)
  75.             if not 'posts' in jsondata:
  76.                 raise KeyError('posts array not found in JSON data')
  77.                 exit(1)
  78.             if len(jsondata['posts']) == 0:
  79.                 print('Done reading list')
  80.                 break
  81.             for post in jsondata['posts']:
  82.                 # Flatten post tags into single set
  83.                 tags = set()
  84.                 try:
  85.                     for category in post['tags']:
  86.                         if not isinstance(post['tags'][category], list):
  87.                             continue
  88.                         tags = tags.union(post['tags'][category])
  89.                 except KeyError as e:
  90.                     raise # XXX
  91.                 tags = sorted(tags)
  92.  
  93.                 try:
  94.                     posts.append(Post(post['id'],
  95.                          post['flags']['deleted'],
  96.                          tags,
  97.                          post['file']['ext'],
  98.                          post['file']['url'],
  99.                          post['sources']))
  100.                 except KeyError as e:
  101.                     raise # XXX
  102.  
  103.             page += 1
  104.             speedlimit()
  105.     return(posts)
  106.  
  107. def writetags(post_id, source, tags):
  108.     if source is None:
  109.         source = ""
  110.     fullpath = os.path.join(destination, 'tags.csv')
  111.     with open(fullpath, 'a', encoding='utf-8') as tagfile:
  112.         writer = csv.writer(tagfile, quoting=csv.QUOTE_MINIMAL)
  113.         row = [post_id, source]
  114.         row.extend(tags)
  115.         writer.writerow(row)
  116.  
  117. def mirror(tags):
  118.     posts = readlist(tags)
  119.     downloaded = 0
  120.     for post in posts:
  121.         if post.deleted:
  122.             print('{0:d} was deleted'.format(post.id))
  123.             continue
  124.         if len(glob.glob(os.path.join(destination, '{0:d}-*'.format(post.id)))) > 0:
  125.             print('{0:d} already exists'.format(post.id))
  126.             continue
  127.         if post.url is None:
  128.             print('{0:d} could not be downloaded, you might have to log in'.format(post.id))
  129.             continue
  130.         speedlimit()
  131.         with urlopen(Request(post.url, headers=hdr)) as request:
  132.             downloaded += 1
  133.             print('{0:d} is being downloaded and saved...'.format(post.id))
  134.             imgdata = request.read()
  135.            
  136.             # Make tags safe for file name usage. Still unelegant as fuck.
  137.             fname_tags = []
  138.             for tag in post.tags:
  139.                 mangled_tag = ''
  140.                 for chara in tag:
  141.                     if chara.isalnum() or chara in KEEPCHARS:
  142.                         mangled_tag += chara
  143.                     else:
  144.                         print('Bad char?', ord(chara))
  145.                 if len(mangled_tag) > 0:
  146.                     fname_tags.append(mangled_tag)
  147.  
  148.             fname = FNAME.format(post.id, '-'.join(fname_tags)[:190], post.ext)
  149.             fullpath = os.path.join(destination, fname)
  150.             with open(fullpath.encode('utf-8'), 'wb') as imgfile:
  151.                 imgfile.write(imgdata)
  152.                 writetags(post.id, post.sources, post.tags)
  153.  
  154.     print('Finished collecting {0:d} posts'.format(len(posts)))
  155.     print('{0:d} were newly downloaded'.format(downloaded))
  156.  
  157. if __name__ == '__main__':
  158.     parser = argparse.ArgumentParser(description='Download files by tag from e621')
  159.     parser.add_argument('destination', help='Directory to store the files in', nargs='?')
  160.     parser.add_argument('tags', help='Tags to look for. Must be URL encoded already. Try "fav:yourname"', nargs='?')
  161.     parser.add_argument('user', help='User account to use with API key', default=None, nargs='?')
  162.     parser.add_argument('key', help='API key, needed for some downloads', default=None, nargs='?')
  163.     args = parser.parse_args()
  164.     if args.tags is None:
  165.         parser.print_help()
  166.         exit (1)
  167.     destination = args.destination
  168.     login(args.user, args.key)
  169.     mirror(args.tags)
Add Comment
Please, Sign In to add comment