"""e621.net archive script."""

import csv
import sys
import os
import requests
import re
from datetime import datetime
import time

# Raise the csv per-field size cap as high as the platform allows so that
# unusually long fields do not abort parsing. csv.field_size_limit() stores
# the value in a C long, so sys.maxsize raises OverflowError on platforms
# where a C long is 32 bits (e.g. 64-bit Windows); halve the candidate until
# it is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2

# Download throttle: REQUEST_INTERVAL is the minimum number of seconds that
# must pass between two consecutive requests.
MAX_REQUESTS_PER_SECOND = 20
REQUEST_INTERVAL = 1.0 / MAX_REQUESTS_PER_SECOND

def format_date(date_str):
    """Parse the leading ``YYYY-MM-DD`` token of a timestamp string.

    Anything after the first run of whitespace (such as a time-of-day part)
    is ignored.

    Args:
        date_str: Text whose first whitespace-separated token is a date in
            ``%Y-%m-%d`` form.

    Returns:
        A ``datetime`` set to midnight of the parsed date.

    Raises:
        IndexError: If ``date_str`` is empty or contains only whitespace.
        ValueError: If the first token is not a ``%Y-%m-%d`` date.
    """
    date_part = date_str.split(None, 1)[0]
    return datetime.strptime(date_part, '%Y-%m-%d')

def construct_image_url(md5, file_ext):
    """Build the static-server URL for a file from its MD5 and extension.

    The path is sharded by the first two and the next two characters of the
    hash: ``data/<md5[0:2]>/<md5[2:4]>/<md5>.<file_ext>``.

    Args:
        md5: MD5 hex digest of the file.
        file_ext: File extension without a leading dot.

    Returns:
        The URL as a string.
    """
    outer_dir, inner_dir = md5[:2], md5[2:4]
    filename = f"{md5}.{file_ext}"
    return f"https://static1.e621.net/data/{outer_dir}/{inner_dir}/{filename}"

def download_image(url, session, last_request_time, timeout=30):
    """Fetch ``url`` through ``session`` while honouring ``REQUEST_INTERVAL``.

    If less than ``REQUEST_INTERVAL`` seconds have passed since
    ``last_request_time``, sleep for the remainder before sending the request.

    Args:
        url: Address of the file to download.
        session: ``requests.Session`` (or compatible object) used for the GET.
        last_request_time: ``time.time()`` value recorded after the previous
            request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the run indefinitely.

    Returns:
        A ``(content, timestamp)`` tuple. ``content`` is the response body as
        bytes on HTTP 200, otherwise ``None``. ``timestamp`` is the
        ``time.time()`` value to pass as ``last_request_time`` on the next
        call.
    """
    elapsed = time.time() - last_request_time
    if elapsed < REQUEST_INTERVAL:
        time.sleep(REQUEST_INTERVAL - elapsed)

    try:
        response = session.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A network-level failure (timeout, reset, DNS, ...) is reported the
        # same way as a bad status so the caller can move on to the next file.
        print(f"Download request failed: {exc}")
        return None, time.time()

    if response.status_code == 200:
        return response.content, time.time()

    print(f"Download request failed with status code: {response.status_code}, response: {response.text}")
    return None, time.time()

def save_image(image_data, filepath):
    """Write ``image_data`` to ``filepath`` without leaving partial files.

    The bytes are first written to ``<filepath>.part`` and then moved into
    place with ``os.replace``. If the write is interrupted, ``filepath``
    itself is never created, so a later "file already exists" check cannot
    mistake a truncated download for a complete one.

    Args:
        image_data: Raw bytes to store.
        filepath: Destination path; its directory must already exist.

    Raises:
        OSError: If the temporary file cannot be written or moved.
    """
    partial_path = f"{filepath}.part"
    try:
        with open(partial_path, 'wb') as file:
            file.write(image_data)
        os.replace(partial_path, filepath)
    finally:
        # After a successful replace the temporary file is gone; this only
        # removes leftovers from a failed or interrupted write.
        if os.path.exists(partial_path):
            os.remove(partial_path)

def _compile_tag_pattern(wildcard_tag):
    """Turn a ``*`` wildcard tag into a regex meant to match a whole tag."""
    # Escape everything except the wildcard so characters such as "(", ")",
    # "+" or "." in a tag are matched literally rather than as regex syntax.
    literal_parts = (re.escape(part) for part in wildcard_tag.split('*'))
    return re.compile('.*'.join(literal_parts))


def main(wildcard_tag, csv_file, parent_dir):
    """Download every non-deleted post whose tags match ``wildcard_tag``.

    Rows are read from ``csv_file``; the columns used are ``tag_string``,
    ``is_deleted``, ``created_at``, ``md5`` and ``file_ext``. Files are stored
    as ``<parent_dir>/<wildcard_tag>/<YYYY>/<MM>/<md5>.<file_ext>``, and files
    already present at that path are skipped.

    Args:
        wildcard_tag: Tag to look for; ``*`` matches any run of characters and
            the pattern must match a tag in full.
        csv_file: Path to the UTF-8 encoded posts CSV.
        parent_dir: Directory under which the per-tag archive is created.
    """
    print("Starting script...")

    parent_dir_path = os.path.join(parent_dir, wildcard_tag)
    os.makedirs(parent_dir_path, exist_ok=True)

    tag_pattern = _compile_tag_pattern(wildcard_tag)

    print(f"Processing tag: {wildcard_tag}")
    print("Reading and processing CSV file...")

    last_request_time = time.time()

    # The session is a context manager so its pooled connections are released
    # even if processing stops with an error.
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'tag2dir/1.0'})

        with open(csv_file, newline='', encoding='utf-8') as csvfile:
            for row in csv.DictReader(csvfile):
                # Cheap flag test first; the regex scan over every tag only
                # runs for rows that are still candidates.
                if row['is_deleted'] != 'f':
                    continue
                tags = row['tag_string'].split()
                if not any(tag_pattern.fullmatch(tag) for tag in tags):
                    continue

                created = format_date(row['created_at'])
                dir_path = os.path.join(
                    parent_dir_path,
                    created.strftime('%Y'),
                    created.strftime('%m'),
                )
                os.makedirs(dir_path, exist_ok=True)

                filename = f"{row['md5']}.{row['file_ext']}"
                file_path = os.path.join(dir_path, filename)
                if os.path.exists(file_path):
                    print(f"File already exists, skipping: {filename}")
                    continue

                print(f"Downloading file: {filename}...")
                image_url = construct_image_url(row['md5'], row['file_ext'])
                image_data, last_request_time = download_image(
                    image_url, session, last_request_time
                )
                if image_data:
                    save_image(image_data, file_path)
                    print(f"Downloaded and saved file: {filename}")
                else:
                    print(f"Failed to download file: {filename}")

    print("Script completed successfully.")

if __name__ == "__main__":
    # Command line: wildcard_tag parent_directory [csv_file].
    # The CSV path is optional; when omitted, the export filename this script
    # was written against is used, so existing two-argument invocations keep
    # working unchanged.
    if len(sys.argv) not in (3, 4):
        print("Usage: python tag2dir.py wildcard_tag parent_directory [csv_file]")
        sys.exit(1)

    wildcard_tag = sys.argv[1]
    parent_directory = sys.argv[2]
    csv_file = sys.argv[3] if len(sys.argv) == 4 else "posts-2024-04-03.csv"

    main(wildcard_tag, csv_file, parent_directory)