Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # colorama==0.4.6 numpy==1.26.1 piexif==1.1.3 Pillow==10.1.0 python-dateutil==2.8.2 pytz==2023.3.post1 six==1.16.0 tqdm==4.66.1 tzdata==2023.3 ...
import glob
import hashlib
import imghdr
import json
import os
import shutil
import socket
import time
import traceback
import urllib.error
import urllib.request
from html.parser import HTMLParser
from urllib.parse import urlparse

import piexif
import tqdm
from PIL import Image, UnidentifiedImageError
class ImgParser(HTMLParser):
    """HTML parser that collects the src attribute of every <img> tag fed to it.

    Collected URLs accumulate in self.img_urls in document order.
    """

    def __init__(self):
        super().__init__()
        self.img_urls = []

    def handle_starttag(self, tag, attrs):
        if tag != 'img':
            return
        for name, value in attrs:
            if name == 'src':
                self.img_urls.append(value)
def parse_html(file_path):
    """Return every <img src> URL found in the HTML file at file_path.

    Returns an empty list (with a message) if the file cannot be read due
    to permissions.
    """
    extractor = ImgParser()
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            markup = handle.read()
    except PermissionError:
        print(f"Permission denied: {file_path}")
        return []
    extractor.feed(markup)
    return extractor.img_urls
def write_to_txt(file_path, img_urls):
    """Write one URL per line to '<file_path>.txt'.

    Fix: the file is now opened with an explicit UTF-8 encoding — the
    original relied on the platform default, which breaks on non-ASCII
    URLs under e.g. Windows cp1252.
    """
    with open(f'{file_path}.txt', 'w', encoding='utf-8') as f:
        f.writelines(f'{url}\n' for url in img_urls)
def create_dir(folder_path):
    """Create folder_path (including parents) if it does not already exist.

    Fix: uses makedirs(exist_ok=True) instead of the original
    check-then-create, which had a TOCTOU race (another process creating
    the directory between the exists() check and makedirs() raised).
    """
    os.makedirs(folder_path, exist_ok=True)
def read_info_from_image(image_path):
    """Extract generation parameters and metadata from an image file.

    Returns (geninfo, items): geninfo is the generation-parameter string
    (or None if absent), items is the remaining image.info dict with
    housekeeping keys stripped. Returns (None, None) when the file is not
    a recognisable image.
    """
    try:
        image = Image.open(image_path)
    except UnidentifiedImageError:
        print(f"Cannot identify image file {image_path}")
        return None, None
    # Fix: close the file handle (the original leaked it). Grab everything
    # we need before the context exits.
    with image:
        items = image.info or {}
        width, height = image.width, image.height
    geninfo = items.pop('parameters', None)  # A1111-style PNG text chunk
    if "exif" in items:
        exif = piexif.load(items["exif"])
        exif_comment = (exif or {}).get("Exif", {}).get(piexif.ExifIFD.UserComment, b'')
        try:
            exif_comment = exif_comment.decode(errors="ignore")
        except ValueError:
            exif_comment = exif_comment.decode('utf8', errors="ignore")
        if exif_comment:
            items['exif comment'] = exif_comment
            geninfo = exif_comment
    # Drop format/housekeeping keys that are not generation metadata.
    for field in ['jfif', 'jfif_version', 'jfif_unit', 'jfif_density', 'dpi', 'exif',
                  'loop', 'background', 'timestamp', 'duration', 'progressive', 'progression',
                  'icc_profile', 'chromaticity']:
        items.pop(field, None)
    if items.get("Software", None) == "NovelAI":
        try:
            json_info = json.loads(items["Comment"])
            # FIX: the original referenced sd_samplers.samplers_map, which is
            # undefined in this script (NameError). Use the sampler name from
            # the NovelAI metadata directly, defaulting to "Euler a".
            sampler = json_info.get("sampler", "Euler a")
            geninfo = f"""{items["Description"]}
Negative prompt: {json_info["uc"]}
Steps: {json_info["steps"]}, Sampler: {sampler}, CFG scale: {json_info["scale"]}, Seed: {json_info["seed"]}, Size: {width}x{height}, Clip skip: 2, ENSD: 31337"""
        except Exception:
            # FIX: the original called errors.report(), also undefined here —
            # the handler itself crashed. Report via stdlib instead.
            print("Error parsing NovelAI image generation parameters")
            traceback.print_exc()
    return geninfo, items
def is_connected():
    """Best-effort online check: try a TCP connection to www.google.com:80.

    Fix: the original leaked the socket (never closed) and had no
    timeout, so a black-holed network could block indefinitely. The
    connection is now closed via the context manager and bounded to 5s.
    """
    try:
        with socket.create_connection(("www.google.com", 80), timeout=5):
            return True
    except OSError:
        return False
def download_images(file_path, img_urls):
    """Download each URL into file_path, extract its generation info, and
    dump the collected records to '<file_path>_info.txt' as JSON.

    Returns the list of {"instruction", "input", "output"} records.

    Fixes vs. original:
    - The retry loop never actually retried: the `break` sat after the
      download try/except, so a failed download printed "Retrying..." and
      then gave up. Transient errors now retry; a definitive HTTP error
      (404 etc.) skips the URL.
    - Removed the unused `allowed_extensions` local and unused loop index.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537'}
    image_info_list = []
    for img_url in tqdm.tqdm(img_urls, desc="Downloading images", unit="image"):
        while True:
            if not is_connected():
                print("Internet connection not available. Retrying in 5 seconds...")
                time.sleep(5)
                continue
            try:
                parsed_url = urlparse(img_url)
                base_name = os.path.basename(parsed_url.path)
                # Hash the name to avoid collisions/invalid characters; keep extension.
                img_file_name = hashlib.md5(base_name.encode()).hexdigest() + os.path.splitext(base_name)[1]
                req = urllib.request.Request(img_url, headers=headers)
                img_data = urllib.request.urlopen(req).read()
                img_file_path = os.path.join(file_path, img_file_name)
                with open(img_file_path, 'wb') as handler:
                    handler.write(img_data)
            except urllib.error.HTTPError as e:
                # Server gave a definitive answer (404, 403, ...): no point retrying.
                print(f"Error downloading image {img_url}: {e}. Skipping...")
                break
            except Exception as e:
                print(f"Error downloading image {img_url}: {e}. Retrying...")
                time.sleep(5)
                continue
            try:
                info, items = read_info_from_image(img_file_path)
                if info is not None:
                    image_info_list.append({
                        "instruction": img_file_path,
                        "input": "",
                        "output": info
                    })
            except Exception as e:
                # Metadata extraction is best-effort; a bad file should not abort the run.
                print(f"Error reading image {img_file_path}: {e}")
            break  # downloaded (and info attempted): move on to the next URL
    info_path = os.path.join(os.path.dirname(file_path), f'{os.path.basename(file_path)}_info.txt')
    with open(info_path, 'w', encoding='utf-8') as info_file:
        json.dump(image_info_list, info_file, ensure_ascii=False, indent=4)
    return image_info_list
def clean_dir(dir_name):
    """Truncate every *.txt file in dir_name whose entire content is '[]'.

    Files containing anything else are left untouched.
    """
    for txt_path in glob.glob(os.path.join(dir_name, "*.txt")):
        with open(txt_path, 'r', encoding='utf-8') as fh:
            payload = fh.read()
        if payload.strip() != '[]':
            continue
        # Opening in 'w' mode truncates the file to zero bytes.
        open(txt_path, 'w').close()
def remove_empty_dirs(dir_name):
    """Delete every immediate subdirectory of dir_name that contains no entries."""
    for entry in os.listdir(dir_name):
        candidate = os.path.join(dir_name, entry)
        # Only directories are eligible; non-empty ones are kept.
        if os.path.isdir(candidate) and not os.listdir(candidate):
            os.rmdir(candidate)
def delete_empty_files(dir_name):
    """Remove zero-byte files from dir_name, then delete all *_links.txt files.

    Fix: the final print emitted the literal placeholder '(unknown)'
    instead of the path of the deleted file; it now reports the real path.
    The redundant nested endswith('.txt') check around the *_links.txt
    removal was also flattened (every *_links.txt already ends in .txt).
    """
    for filename in os.listdir(dir_name):
        file_path = os.path.join(dir_name, filename)
        if not os.path.isfile(file_path):
            continue
        if os.path.getsize(file_path) == 0:
            os.remove(file_path)
            print(f'Deleted empty file: {file_path}')
        elif filename.endswith('.txt'):
            # Byte-level probe kept from the original as a defensive re-check
            # for .txt files (getsize above already catches truly empty ones).
            with open(file_path, 'rb') as f:
                if not f.read(1):
                    os.remove(file_path)
                    print(f'Deleted empty .txt file: {file_path}')
    # Second pass: *_links.txt URL dumps are intermediate artifacts.
    for filename in os.listdir(dir_name):
        if filename.endswith('_links.txt'):
            file_path = os.path.join(dir_name, filename)
            os.remove(file_path)
            print(f'Deleted: {file_path}')
def combine_files(dir_name):
    """Merge every JSON list stored in dir_name/*.txt into dir_name/combined.json.

    Robustness fix: the original crashed with JSONDecodeError if any .txt
    in the folder was not valid JSON (e.g. a leftover URL dump). Such
    files are now skipped with a warning.
    """
    combined_data = []
    for filename in os.listdir(dir_name):
        if not filename.endswith('.txt'):
            continue
        path = os.path.join(dir_name, filename)
        with open(path, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                print(f"Skipping non-JSON file: {path}")
                continue
        combined_data.extend(data)
    with open(os.path.join(dir_name, 'combined.json'), 'w', encoding='utf-8') as outfile:
        json.dump(combined_data, outfile, ensure_ascii=False, indent=4)
def main():
    """Process every unhandled .html file in ./SD: extract image URLs,
    download the images and their generation metadata, then clean up and
    merge the per-file metadata into SD/combined.json.

    Fixes: removed the unused enumerate index and the unused
    image_info_list binding from the original.
    """
    dir_name = 'SD'
    file_ext = '.html'
    # Skip HTML files whose output directory already exists (already processed).
    html_files = [
        filename for filename in os.listdir(dir_name)
        if filename.endswith(file_ext)
        and not os.path.exists(os.path.join(dir_name, os.path.splitext(filename)[0]))
    ]
    for filename in tqdm.tqdm(html_files, desc="Processing HTML files", unit="file"):
        file_base_name = os.path.splitext(filename)[0]
        file_dir = os.path.join(dir_name, file_base_name)
        create_dir(file_dir)
        img_urls = parse_html(os.path.join(dir_name, filename))
        write_to_txt(f'{file_dir}_links', img_urls)
        download_images(file_dir, img_urls)
    clean_dir(dir_name)
    remove_empty_dirs(dir_name)
    delete_empty_files(dir_name)
    combine_files(dir_name)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement