Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv
- import sys
- import os
- import requests
- import re
- from datetime import datetime
- import time
# Raise the csv module's per-field size cap as high as the platform allows.
# csv.field_size_limit() raises OverflowError when the value does not fit in
# a C long, which is the case for sys.maxsize on platforms with a 32-bit long
# (e.g. 64-bit Windows), so halve the candidate until it is accepted.
_field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_size_limit)
        break
    except OverflowError:
        _field_size_limit //= 2

# Request throttle: the maximum number of HTTP requests per second, and the
# resulting minimum spacing (in seconds) between two consecutive requests.
MAX_REQUESTS_PER_SECOND = 20
REQUEST_INTERVAL = 1.0 / MAX_REQUESTS_PER_SECOND
def format_date(date_str):
    """Parse the leading ``YYYY-MM-DD`` token of *date_str* into a datetime.

    Only the first whitespace-separated token is parsed; anything after it
    (such as a time-of-day component) is ignored.
    """
    date_token = date_str.split(None, 1)[0]
    return datetime.strptime(date_token, '%Y-%m-%d')
def construct_image_url(md5, file_ext):
    """Build the static-file URL for a post from its MD5 and file extension.

    The path is made of the first two characters of *md5*, then the next
    two, then ``<md5>.<file_ext>``.
    """
    first_shard = md5[:2]
    second_shard = md5[2:4]
    return f"https://static1.e621.net/data/{first_shard}/{second_shard}/{md5}.{file_ext}"
def download_image(url, session, last_request_time, timeout=30):
    """Fetch *url* through *session*, spacing requests by REQUEST_INTERVAL.

    Args:
        url: Address to fetch.
        session: Object exposing ``get(url, timeout=...)``; ``main`` passes a
            ``requests.Session``.
        last_request_time: ``time.time()`` stamp of the previous request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        A ``(content, timestamp)`` tuple. ``content`` is the response body
        as bytes on HTTP 200, otherwise ``None``. ``timestamp`` is the
        ``time.time()`` value to pass as ``last_request_time`` next call.
    """
    elapsed = time.time() - last_request_time
    if elapsed < REQUEST_INTERVAL:
        time.sleep(REQUEST_INTERVAL - elapsed)

    try:
        response = session.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Connection errors and timeouts are reported as a failed download
        # so one bad request does not abort the whole run.
        print(f"Download request failed: {error}")
        return None, time.time()

    if response.status_code == 200:
        return response.content, time.time()

    print(f"Download request failed with status code: {response.status_code}, response: {response.text}")
    return None, time.time()
def save_image(image_data, filepath):
    """Write *image_data* (bytes) to *filepath* without exposing partial files.

    The data is first written to ``<filepath>.part`` and then moved into
    place with ``os.replace``. If the write is interrupted, no truncated
    file is left at *filepath*, so a later existence check on *filepath*
    cannot mistake a partial download for a complete one.

    Raises:
        OSError: If the file cannot be written or moved into place.
    """
    partial_path = f"{filepath}.part"
    try:
        with open(partial_path, 'wb') as file:
            file.write(image_data)
        os.replace(partial_path, filepath)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the leftover temp file, then
        # let the original error propagate unchanged.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        raise
def _compile_tag_pattern(wildcard_tag):
    """Compile *wildcard_tag* into a regex in which only ``*`` is a wildcard."""
    # Escape the literal pieces so characters such as "(", ")" or "+" in a
    # tag are matched literally instead of being read as regex syntax.
    escaped_parts = [re.escape(part) for part in wildcard_tag.split('*')]
    return re.compile('.*'.join(escaped_parts))


def main(wildcard_tag, csv_file, parent_dir):
    """Download every non-deleted post in *csv_file* carrying a matching tag.

    Each CSV row is expected to provide the columns ``tag_string``,
    ``is_deleted``, ``created_at``, ``md5`` and ``file_ext``. A row is
    selected when ``is_deleted`` equals ``'f'`` and at least one of its
    whitespace-separated tags matches *wildcard_tag* in full (``*`` matches
    any run of characters). Files are stored under
    ``<parent_dir>/<wildcard_tag>/<year>/<month>/<md5>.<file_ext>``; files
    that already exist there are skipped.

    Args:
        wildcard_tag: Tag to look for; may contain ``*`` wildcards.
        csv_file: Path of the UTF-8 CSV file listing the posts.
        parent_dir: Directory under which the tag directory is created.
    """
    print("Starting script...")

    parent_dir_path = os.path.join(parent_dir, wildcard_tag)
    os.makedirs(parent_dir_path, exist_ok=True)

    tag_pattern = _compile_tag_pattern(wildcard_tag)

    # The session is used as a context manager so its pooled connections
    # are released when processing ends, including on error.
    with requests.Session() as session:
        session.headers.update({'User-Agent': 'tag2dir/1.0'})

        print(f"Processing tag: {wildcard_tag}")
        print("Reading and processing CSV file...")

        last_request_time = time.time()

        with open(csv_file, newline='', encoding='utf-8') as csvfile:
            for row in csv.DictReader(csvfile):
                if row['is_deleted'] != 'f':
                    continue
                # fullmatch: without a wildcard the tag must match exactly,
                # so "cat" does not also select "catgirl".
                tags = row['tag_string'].split()
                if not any(tag_pattern.fullmatch(tag) for tag in tags):
                    continue

                formatted_date = format_date(row['created_at'])
                year = formatted_date.strftime('%Y')
                month = formatted_date.strftime('%m')

                dir_path = os.path.join(parent_dir_path, year, month)
                os.makedirs(dir_path, exist_ok=True)

                filename = f"{row['md5']}.{row['file_ext']}"
                file_path = os.path.join(dir_path, filename)

                if os.path.exists(file_path):
                    print(f"File already exists, skipping: {filename}")
                    continue

                print(f"Downloading file: {filename}...")
                image_url = construct_image_url(row['md5'], row['file_ext'])
                image_data, last_request_time = download_image(
                    image_url, session, last_request_time
                )

                if image_data:
                    save_image(image_data, file_path)
                    print(f"Downloaded and saved file: {filename}")
                else:
                    print(f"Failed to download file: {filename}")

    print("Script completed successfully.")
if __name__ == "__main__":
    # Command line: wildcard_tag parent_directory [csv_file].
    # The CSV path is optional; when it is omitted the default file name
    # below is used, so existing two-argument invocations still work.
    if len(sys.argv) not in (3, 4):
        print("Usage: python tag2dir.py wildcard_tag parent_directory [csv_file]")
        sys.exit(1)

    wildcard_tag = sys.argv[1]
    parent_directory = sys.argv[2]
    csv_file = sys.argv[3] if len(sys.argv) == 4 else "posts-2024-04-03.csv"

    main(wildcard_tag, csv_file, parent_directory)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement