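# e621 bulk downloader: walks the e621 posts API for a tag query, saves every
# matching image (or video) plus a per-post tags.txt, and stops after --count
# downloads. Illustrative invocation (the script filename and tag query below
# are assumptions, not part of the original paste):
#
#   python e621_downloader.py -t "canine rating:safe" -o ./dataset -c 500
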
import argparse
import hashlib
import os
import random
import sys
import time
import urllib.parse

import requests
from rich.progress import track

parser = argparse.ArgumentParser()
parser.add_argument("-o", "--outputfolder", type=str, default="./output", help="Path to save files")
parser.add_argument("-t", "--tag", type=str, required=True, help="Tags to download")
parser.add_argument("-s", "--sleeptime", type=float, default=0.5, help="Number of seconds to sleep between downloads")
parser.add_argument("-p", "--startingpage", type=int, default=1, help="Page to start the search from")
parser.add_argument("-c", "--count", type=int, default=1000, help="How many random images to download (default 1000)")
parser.add_argument("--maxtimeout", type=int, default=1024, help="Timeout in seconds before the requester stops retrying links or images")
parser.add_argument("--debug", action="store_true", help="Enable debug output")
parser.add_argument("-r", "--proxy", type=str, default="", help="SOCKS5 proxy as user:pass@ip:port (requires pip install requests[socks])")
args = parser.parse_args()


def sendRequest(url):
    """GET a URL, retrying with exponential backoff; exit on a non-200 response."""
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0"}
    proxies = {'http': f'socks5://{args.proxy}', 'https': f'socks5://{args.proxy}'} if args.proxy else None
    timeout = 2          # seconds to sleep before the next retry, doubled on every failure
    requesttimeout = 12  # per-request timeout, increased by 3 s on every failure
    while True:
        try:
            if args.debug:
                print(f'<== {timeout} === {url} @ {requesttimeout}')
            r = requests.get(url, headers=headers, timeout=requesttimeout, proxies=proxies)
            if args.debug:
                print(f"=> CODE {r.status_code}")
            break
        except Exception as E:
            requesttimeout += 3
            if timeout > args.maxtimeout:
                print(f"<= {url}")
                print(f"REASON: {E}")
                print("Exiting with code 1.")
                sys.exit(1)
            if args.debug or timeout >= 32:
                print(f"<= {url}")
                print(f"REASON: {E}")
                print(f"Waiting {timeout} seconds + increasing request timeout to {requesttimeout}...")
            time.sleep(timeout)
            timeout *= 2
    if r.status_code != 200:
        print(f"<= {url}")
        print(f"REASON: Status code {r.status_code}")
        print("Exiting with code 1.")
        sys.exit(1)
    return r
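
# Retry budget of sendRequest with the defaults: the sleep doubles 2, 4, 8, ...
# and retries stop once it exceeds --maxtimeout, so with maxtimeout=1024 a dead
# URL gets 10 waits totalling 2 + 4 + ... + 1024 = 2046 s (~34 min) before exit.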


def calculateMD5(filepath):
    with open(filepath, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()


def downloadAs(url, filepath):
    time.sleep(args.sleeptime)  # throttle requests between downloads
    file_contents = sendRequest(url).content
    with open(filepath, 'wb') as f:
        f.write(file_contents)


def load_json(tags):
    """Page through the e621 search API and collect every matching post."""
    tags_readable = urllib.parse.quote(tags)  # URL-encode the whole query, not just spaces
    page = args.startingpage
    db_full = {"posts": []}
    while True:
        db_url = f'https://e621.net/posts.json?limit=300&page={page}&tags={tags_readable}'
        db_url_data = sendRequest(db_url).json()
        if not db_url_data["posts"]:
            break
        db_full["posts"] += db_url_data["posts"]
        page += 1
        time.sleep(args.sleeptime)
    print(f'Found {len(db_full["posts"])} images.')
    random.shuffle(db_full["posts"])  # shuffle so --count takes a random sample
    return db_full
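
# Shape of the posts.json response this script relies on (all other fields are
# ignored; values here are illustrative):
#   {"posts": [{"id": 123,
#               "rating": "s",
#               "file": {"ext": "png", "md5": "...", "url": "https://..."},
#               "tags": {"general": [...], "species": [...],
#                        "character": [...], "artist": [...]}}]}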


def downloadImageAndTags(json_point):
    postId = json_point['id']
    imageExt = json_point['file']['ext']
    imageUrl = json_point['file']['url']
    imageMD5 = json_point['file']['md5']
    if not imageUrl:
        # e621 hides some file URLs from anonymous API clients, but the CDN
        # path can be rebuilt from the first two byte-pairs of the file's MD5
        md_12 = imageMD5[:2]
        md_34 = imageMD5[2:4]
        imageUrl = f'https://static1.e621.net/data/{md_12}/{md_34}/{imageMD5}.{imageExt}'
    os.makedirs(f'{args.outputfolder}/{postId}', exist_ok=True)
    # The image is saved next to the per-post folder that holds its tags.txt
    imagePhysicalPath = f'{args.outputfolder}/{postId}.{imageExt}'
    if not os.path.exists(imagePhysicalPath):
        # File doesn't exist yet: download it
        downloadAs(imageUrl, imagePhysicalPath)
    elif calculateMD5(imagePhysicalPath) != imageMD5:
        # File exists but its checksum doesn't match: redownload it
        downloadAs(imageUrl, imagePhysicalPath)
    # Always rewrite the tags file, just for convenience.
    tagsPhysicalPath = f'{args.outputfolder}/{postId}/tags.txt'
    tags = []
    for category in ('general', 'species', 'character', 'artist'):
        tags += json_point['tags'][category]
    tags.append("rating:" + json_point['rating'])
    with open(tagsPhysicalPath, 'w', encoding='utf-8') as f:
        f.write(',\n'.join(tags))
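
# With the ',\n' join above, a resulting tags.txt looks like (tag values are
# illustrative):
#   canine,
#   mammal,
#   rating:s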


os.makedirs(args.outputfolder, exist_ok=True)
c = 0
for post in track(load_json(args.tag)['posts'], description=f"Downloading posts from e621 tagged as '{args.tag}'..."):
    try:
        downloadImageAndTags(post)
        c += 1
        if c >= args.count:  # was `c > args.count`, which downloaded one post too many
            print(f"{args.count} posts downloaded, shutting down!")
            break
    except Exception as E:
        print(f"{post['id']} failed catastrophically! ({E})")