Advertisement
Guest User

e621.net archive script

a guest
Apr 7th, 2024
240
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.23 KB | None | 0 0
  1. import csv
  2. import sys
  3. import os
  4. import requests
  5. import re
  6. from datetime import datetime
  7. import time
  8.  
  9. csv.field_size_limit(sys.maxsize)
  10.  
  11. MAX_REQUESTS_PER_SECOND = 20
  12. REQUEST_INTERVAL = 1.0 / MAX_REQUESTS_PER_SECOND
  13.  
  14. def format_date(date_str):
  15.     return datetime.strptime(date_str.split()[0], '%Y-%m-%d')
  16.  
  17. def construct_image_url(md5, file_ext):
  18.     return f"https://static1.e621.net/data/{md5[:2]}/{md5[2:4]}/{md5}.{file_ext}"
  19.  
  20. def download_image(url, session, last_request_time):
  21.     current_time = time.time()
  22.     if current_time - last_request_time < REQUEST_INTERVAL:
  23.         time.sleep(REQUEST_INTERVAL - (current_time - last_request_time))
  24.  
  25.     response = session.get(url)
  26.     if response.status_code == 200:
  27.         return response.content, time.time()
  28.     else:
  29.         print(f"Download request failed with status code: {response.status_code}, response: {response.text}")
  30.     return None, time.time()
  31.  
  32. def save_image(image_data, filepath):
  33.     with open(filepath, 'wb') as file:
  34.         file.write(image_data)
  35.  
  36. def main(wildcard_tag, csv_file, parent_dir):
  37.     print("Starting script...")
  38.  
  39.     parent_dir_path = os.path.join(parent_dir, wildcard_tag)
  40.     if not os.path.exists(parent_dir_path):
  41.         os.makedirs(parent_dir_path)
  42.  
  43.     session = requests.Session()
  44.     session.headers.update({'User-Agent': 'tag2dir/1.0'})
  45.  
  46.     tag_pattern = re.compile(wildcard_tag.replace('*', '.*'))
  47.  
  48.     print(f"Processing tag: {wildcard_tag}")
  49.     print("Reading and processing CSV file...")
  50.  
  51.     last_request_time = time.time()
  52.  
  53.     with open(csv_file, newline='', encoding='utf-8') as csvfile:
  54.         reader = csv.DictReader(csvfile)
  55.         for row in reader:
  56.             tags = row['tag_string'].split()
  57.             if any(tag_pattern.match(tag) for tag in tags) and row['is_deleted'] == 'f':
  58.                 formatted_date = format_date(row['created_at'])
  59.                 year = formatted_date.strftime('%Y')
  60.                 month = formatted_date.strftime('%m')
  61.                 dir_path = os.path.join(parent_dir_path, year, month)
  62.                 if not os.path.exists(dir_path):
  63.                     os.makedirs(dir_path)
  64.                 filename = f"{row['md5']}.{row['file_ext']}"
  65.                 file_path = os.path.join(dir_path, filename)
  66.                 if os.path.exists(file_path):
  67.                     print(f"File already exists, skipping: {filename}")
  68.                     continue
  69.                 print(f"Downloading file: {filename}...")
  70.                 image_url = construct_image_url(row['md5'], row['file_ext'])
  71.                 image_data, last_request_time = download_image(image_url, session, last_request_time)
  72.                 if image_data:
  73.                     save_image(image_data, file_path)
  74.                     print(f"Downloaded and saved file: {filename}")
  75.                 else:
  76.                     print(f"Failed to download file: {filename}")
  77.  
  78.     print("Script completed successfully.")
  79.  
  80. if __name__ == "__main__":
  81.     if len(sys.argv) != 3:
  82.         print("Usage: python tag2dir.py wildcard_tag parent_directory")
  83.         sys.exit(1)
  84.  
  85.     wildcard_tag = sys.argv[1]
  86.     parent_directory = sys.argv[2]
  87.     csv_file = "posts-2024-04-03.csv"
  88.  
  89.     main(wildcard_tag, csv_file, parent_directory)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement