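# e621 bulk downloader: walks the e621 posts API for a tag query, saves every
# matching image (or video) plus a per-post tags.txt, and stops after --count
# downloads. Illustrative invocation (the script filename and tag query below
# are assumptions, not part of the original paste):
#
#   python e621_downloader.py -t "canine rating:safe" -o ./dataset -c 500
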
import argparse
import hashlib
import os
import random
import sys
import time
import urllib.parse

import requests
from rich.progress import track

parser = argparse.ArgumentParser()
parser.add_argument("-o", "--outputfolder", type=str, default="./output", help="Path to save files")
parser.add_argument("-t", "--tag", type=str, required=True, help="Tags to download")
parser.add_argument("-s", "--sleeptime", type=float, default=0.5, help="Number of seconds to sleep between downloads")
parser.add_argument("-p", "--startingpage", type=int, default=1, help="Page to start the search from")
parser.add_argument("-c", "--count", type=int, default=1000, help="How many random images to download (default 1000)")
parser.add_argument("--maxtimeout", type=int, default=1024, help="Timeout in seconds before the requester stops retrying links or images")
parser.add_argument("--debug", action="store_true", help="Enable debug output")
parser.add_argument("-r", "--proxy", type=str, default="", help="SOCKS5 proxy as user:pass@ip:port (requires pip install requests[socks])")
args = parser.parse_args()


def sendRequest(url):
    """GET a URL, retrying with exponential backoff; exit on a non-200 response."""
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0"}
    proxies = {'http': f'socks5://{args.proxy}', 'https': f'socks5://{args.proxy}'} if args.proxy else None
    timeout = 2          # seconds to sleep before the next retry, doubled on every failure
    requesttimeout = 12  # per-request timeout, increased by 3 s on every failure
    while True:
        try:
            if args.debug:
                print(f'<== {timeout} === {url} @ {requesttimeout}')
            r = requests.get(url, headers=headers, timeout=requesttimeout, proxies=proxies)
            if args.debug:
                print(f"=> CODE {r.status_code}")
            break
        except Exception as E:
            requesttimeout += 3
            if timeout > args.maxtimeout:
                print(f"<= {url}")
                print(f"REASON: {E}")
                print("Exiting with code 1.")
                sys.exit(1)
            if args.debug or timeout >= 32:
                print(f"<= {url}")
                print(f"REASON: {E}")
                print(f"Waiting {timeout} seconds + increasing request timeout to {requesttimeout}...")
            time.sleep(timeout)
            timeout *= 2
    if r.status_code != 200:
        print(f"<= {url}")
        print(f"REASON: Status code {r.status_code}")
        print("Exiting with code 1.")
        sys.exit(1)
    return r
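
# Retry budget of sendRequest with the defaults: the sleep doubles 2, 4, 8, ...
# and retries stop once it exceeds --maxtimeout, so with maxtimeout=1024 a dead
# URL gets 10 waits totalling 2 + 4 + ... + 1024 = 2046 s (~34 min) before exit.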


def calculateMD5(filepath):
    with open(filepath, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()


def downloadAs(url, filepath):
    time.sleep(args.sleeptime)  # throttle requests between downloads
    file_contents = sendRequest(url).content
    with open(filepath, 'wb') as f:
        f.write(file_contents)


def load_json(tags):
    """Page through the e621 search API and collect every matching post."""
    tags_readable = urllib.parse.quote(tags)  # URL-encode the whole query, not just spaces
    page = args.startingpage
    db_full = {"posts": []}
    while True:
        db_url = f'https://e621.net/posts.json?limit=300&page={page}&tags={tags_readable}'
        db_url_data = sendRequest(db_url).json()
        if not db_url_data["posts"]:
            break
        db_full["posts"] += db_url_data["posts"]
        page += 1
        time.sleep(args.sleeptime)
    print(f'Found {len(db_full["posts"])} images.')
    random.shuffle(db_full["posts"])  # shuffle so --count takes a random sample
    return db_full
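
# Shape of the posts.json response this script relies on (all other fields are
# ignored; values here are illustrative):
#   {"posts": [{"id": 123,
#               "rating": "s",
#               "file": {"ext": "png", "md5": "...", "url": "https://..."},
#               "tags": {"general": [...], "species": [...],
#                        "character": [...], "artist": [...]}}]}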


def downloadImageAndTags(json_point):
    postId = json_point['id']
    imageExt = json_point['file']['ext']
    imageUrl = json_point['file']['url']
    imageMD5 = json_point['file']['md5']
    if not imageUrl:
        # e621 hides some file URLs from anonymous API clients, but the CDN
        # path can be rebuilt from the first two byte-pairs of the file's MD5
        md_12 = imageMD5[:2]
        md_34 = imageMD5[2:4]
        imageUrl = f'https://static1.e621.net/data/{md_12}/{md_34}/{imageMD5}.{imageExt}'
    os.makedirs(f'{args.outputfolder}/{postId}', exist_ok=True)
    # The image is saved next to the per-post folder that holds its tags.txt
    imagePhysicalPath = f'{args.outputfolder}/{postId}.{imageExt}'
    if not os.path.exists(imagePhysicalPath):
        # File doesn't exist yet: download it
        downloadAs(imageUrl, imagePhysicalPath)
    elif calculateMD5(imagePhysicalPath) != imageMD5:
        # File exists but its checksum doesn't match: redownload it
        downloadAs(imageUrl, imagePhysicalPath)
    # Always rewrite the tags file, just for convenience.
    tagsPhysicalPath = f'{args.outputfolder}/{postId}/tags.txt'
    tags = []
    for category in ('general', 'species', 'character', 'artist'):
        tags += json_point['tags'][category]
    tags.append("rating:" + json_point['rating'])
    with open(tagsPhysicalPath, 'w', encoding='utf-8') as f:
        f.write(',\n'.join(tags))
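
# With the ',\n' join above, a resulting tags.txt looks like (tag values are
# illustrative):
#   canine,
#   mammal,
#   rating:s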


os.makedirs(args.outputfolder, exist_ok=True)
c = 0
for post in track(load_json(args.tag)['posts'], description=f"Downloading posts from e621 tagged as '{args.tag}'..."):
    try:
        downloadImageAndTags(post)
        c += 1
        if c >= args.count:  # was `c > args.count`, which downloaded one post too many
            print(f"{args.count} posts downloaded, shutting down!")
            break
    except Exception as E:
        print(f"{post['id']} failed catastrophically! ({E})")