# 1_scraptHtml.py

# colorama==0.4.6 numpy==1.26.1 piexif==1.1.3 Pillow==10.1.0 python-dateutil==2.8.2 pytz==2023.3.post1 six==1.16.0 tqdm==4.66.1 tzdata==2023.3 ...
import os, json, urllib.request, piexif, shutil, glob, imghdr, json, hashlib, time, socket, tqdm
from urllib.parse import urlparse
from html.parser import HTMLParser
from PIL import Image, UnidentifiedImageError

class ImgParser(HTMLParser):
    """HTML parser that collects the ``src`` of every ``<img>`` tag it is fed.

    Attributes:
        img_urls: ``src`` values in document order, exactly as written in the
            HTML (they are neither resolved nor validated).
    """

    def __init__(self):
        super().__init__()
        self.img_urls = []

    def handle_starttag(self, tag, attrs):
        """Record the ``src`` attribute value(s) of an ``<img>`` start tag.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != 'img':
            return
        # A bare ``src`` attribute is reported with value None and ``src=""``
        # with an empty string; neither names an image, so both are skipped
        # instead of being stored as a bogus URL.
        self.img_urls.extend(
            value for name, value in attrs if name == 'src' and value
        )

def parse_html(file_path):
    """Return the ``<img>`` ``src`` values found in an HTML file.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The list of ``src`` strings collected by ``ImgParser``, or an empty
        list when the file cannot be read because of missing permissions.
    """
    parser = ImgParser()
    try:
        # The file is decoded as UTF-8, but undecodable bytes are replaced
        # rather than raising, so one stray byte in a saved page does not
        # abort the whole run with UnicodeDecodeError.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            parser.feed(f.read())
    except PermissionError:
        print(f"Permission denied: {file_path}")
        return []
    return parser.img_urls


def write_to_txt(file_path, img_urls):
    """Write the given URLs, one per line, to ``<file_path>.txt``.

    Args:
        file_path: Output path without the ``.txt`` suffix (it is appended).
        img_urls: Iterable of URLs to write.
    """
    # Explicit UTF-8: the platform default encoding may be unable to represent
    # every character in a URL, and the other functions in this file read
    # ``.txt`` files as UTF-8.
    with open(f'{file_path}.txt', 'w', encoding='utf-8') as f:
        f.writelines(f'{url}\n' for url in img_urls)

def create_dir(folder_path):
    """Create ``folder_path`` (including missing parents) if it is absent.

    Args:
        folder_path: Directory to create.

    Raises:
        FileExistsError: If ``folder_path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(folder_path, exist_ok=True)

def read_info_from_image(image_path):
    """Extract generation parameters and remaining metadata from an image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        A ``(geninfo, items)`` tuple. ``geninfo`` is the image's
        ``parameters`` metadata entry, replaced by the EXIF UserComment when
        that is non-empty, or rebuilt from the ``Description``/``Comment``
        entries when ``Software`` is ``"NovelAI"``; it is None when none of
        these is present. ``items`` is the metadata dict left after removing
        ``parameters`` and a fixed set of technical fields. ``(None, None)``
        is returned when Pillow cannot identify the file.
    """
    # Imported explicitly: ``import piexif`` alone may not load the helper
    # submodule.
    from piexif import helper as piexif_helper

    try:
        image = Image.open(image_path)
    except UnidentifiedImageError:
        print(f"Cannot identify image file {image_path}")
        return None, None

    # Only the metadata and the size are needed, so copy them out and let the
    # ``with`` block close the underlying file handle right away.
    with image:
        items = dict(image.info or {})
        width, height = image.size

    geninfo = items.pop('parameters', None)

    if "exif" in items:
        try:
            exif = piexif.load(items["exif"])
        except (OSError, ValueError):
            # Unreadable EXIF must not discard the ``parameters`` text above.
            exif = None
        exif_comment = (exif or {}).get("Exif", {}).get(piexif.ExifIFD.UserComment, b'')
        try:
            # UserComment starts with an 8-byte character-code header; the
            # helper strips it and decodes the payload accordingly.
            exif_comment = piexif_helper.UserComment.load(exif_comment)
        except ValueError:
            # Header missing or unknown: fall back to a lenient UTF-8 decode.
            exif_comment = exif_comment.decode('utf8', errors="ignore")

        if exif_comment:
            items['exif comment'] = exif_comment
            geninfo = exif_comment

    for field in ['jfif', 'jfif_version', 'jfif_unit', 'jfif_density', 'dpi', 'exif',
                    'loop', 'background', 'timestamp', 'duration', 'progressive', 'progression',
                    'icc_profile', 'chromaticity']:
        items.pop(field, None)

    if items.get("Software", None) == "NovelAI":
        try:
            json_info = json.loads(items["Comment"])
            try:
                samplers_map = sd_samplers.samplers_map
            except NameError:
                # ``sd_samplers`` is not defined in this file; without it
                # every sampler falls back to the "Euler a" default below.
                # NOTE(review): provide a real sampler table if accurate
                # sampler names are required.
                samplers_map = {}
            sampler = samplers_map.get(json_info["sampler"], "Euler a")

            geninfo = f"""{items["Description"]}
Negative prompt: {json_info["uc"]}
Steps: {json_info["steps"]}, Sampler: {sampler}, CFG scale: {json_info["scale"]}, Seed: {json_info["seed"]}, Size: {width}x{height}, Clip skip: 2, ENSD: 31337"""
        except Exception as exc:
            # Best effort: a malformed NovelAI comment keeps whatever geninfo
            # was found earlier. Reported with print like the rest of the file.
            print(f"Error parsing NovelAI image generation parameters: {exc}")

    return geninfo, items

def is_connected(host="www.google.com", port=80, timeout=5):
    """Report whether a TCP connection to ``host:port`` can be opened.

    Args:
        host: Host name or address to probe.
        port: TCP port to probe.
        timeout: Seconds to wait for the connection attempt.

    Returns:
        True if the connection succeeded, False on any ``OSError``
        (refused, unreachable, name resolution failure, timeout).
    """
    try:
        # The ``with`` block closes the probe socket; the timeout keeps a
        # dead network from blocking the caller indefinitely.
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

def _download_image(img_url, dest_dir, headers, max_retries, timeout):
    """Fetch one image into ``dest_dir``; return the saved path or None on failure."""
    from urllib.error import HTTPError

    try:
        url_file_name = os.path.basename(urlparse(img_url).path)
        # NOTE(review): the local name is derived from the URL's basename
        # only, so two URLs ending in the same file name map to the same
        # local file -- confirm whether that is intended.
        img_file_name = hashlib.md5(url_file_name.encode()).hexdigest() + os.path.splitext(url_file_name)[1]
        req = urllib.request.Request(img_url, headers=headers)
    except (AttributeError, TypeError, ValueError) as e:
        # Unusable URL (e.g. a relative ``src``): retrying cannot help.
        print(f"Error downloading image {img_url}: {e}")
        return None

    failures = 0
    while True:
        if not is_connected():
            print("Internet connection not available. Retrying in 5 seconds...")
            time.sleep(5)
            continue
        try:
            with urllib.request.urlopen(req, timeout=timeout) as response:
                img_data = response.read()
            break
        except HTTPError as e:
            # The server answered with an error status; do not retry.
            print(f"Error downloading image {img_url}: {e}")
            return None
        except Exception as e:  # best effort: one bad URL must not stop the batch
            failures += 1
            if failures > max_retries:
                print(f"Error downloading image {img_url}: {e}. Giving up.")
                return None
            print(f"Error downloading image {img_url}: {e}. Retrying...")
            time.sleep(failures)  # short, growing pause between attempts

    img_file_path = os.path.join(dest_dir, img_file_name)
    try:
        with open(img_file_path, 'wb') as handler:
            handler.write(img_data)
    except OSError as e:
        print(f"Error downloading image {img_url}: {e}")
        return None
    return img_file_path


def download_images(file_path, img_urls, max_retries=3, timeout=30):
    """Download images into a directory and record their generation info.

    Each URL is fetched after ``is_connected()`` reports connectivity
    (waiting 5 seconds between checks otherwise). Failed downloads are
    retried up to ``max_retries`` times, except for HTTP error responses,
    which are skipped immediately. For every saved image,
    ``read_info_from_image`` is called and a record is kept when it yields
    generation info. The records are also dumped as JSON to
    ``<file_path>_info.txt`` next to the directory.

    Args:
        file_path: Existing directory that receives the image files.
        img_urls: Iterable of image URLs.
        max_retries: Extra attempts after a failed, non-HTTP-error download.
        timeout: Seconds to wait on each HTTP request.

    Returns:
        A list of ``{"instruction": <image path>, "input": "", "output":
        <generation info>}`` dicts.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537'}
    image_info_list = []

    for img_url in tqdm.tqdm(img_urls, desc="Downloading images", unit="image"):
        img_file_path = _download_image(img_url, file_path, headers, max_retries, timeout)
        if img_file_path is None:
            continue
        try:
            info, _ = read_info_from_image(img_file_path)
        except Exception as e:  # best effort: an unreadable image is only reported
            print(f"Error reading image {img_file_path}: {e}")
            continue
        if info is not None:
            image_info_list.append({
                "instruction": img_file_path,
                "input": "",
                "output": info
            })

    info_path = os.path.join(os.path.dirname(file_path), f'{os.path.basename(file_path)}_info.txt')
    with open(info_path, 'w', encoding='utf-8') as info_file:
        json.dump(image_info_list, info_file, ensure_ascii=False, indent=4)
    return image_info_list


def clean_dir(dir_name):
    """Truncate every ``.txt`` file directly in ``dir_name`` that holds only ``[]``.

    Args:
        dir_name: Directory whose ``*.txt`` files are inspected.
    """
    # glob.escape keeps characters such as ``[`` in dir_name from being
    # interpreted as glob wildcards.
    pattern = os.path.join(glob.escape(dir_name), "*.txt")

    for txt_file in glob.glob(pattern):
        # Only equality with '[]' matters, so undecodable bytes are replaced
        # instead of raising UnicodeDecodeError on files in another encoding.
        with open(txt_file, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
        # Truncate only after the read handle has been closed.
        if content.strip() == '[]':
            with open(txt_file, 'w'):
                pass

def remove_empty_dirs(dir_name):
    """Delete the empty immediate subdirectories of ``dir_name``.

    Only direct children are examined; nested directories are left alone.
    """
    for entry in os.listdir(dir_name):
        candidate = os.path.join(dir_name, entry)
        # Plain files (and anything else that is not a directory) are skipped.
        if not os.path.isdir(candidate):
            continue
        if not os.listdir(candidate):
            os.rmdir(candidate)

def delete_empty_files(dir_name):
    """Delete zero-byte files and ``*_links.txt`` files directly in ``dir_name``.

    Subdirectories and other non-file entries are left untouched.

    Args:
        dir_name: Directory to clean.
    """
    for filename in os.listdir(dir_name):
        file_path = os.path.join(dir_name, filename)
        if not os.path.isfile(file_path):
            continue
        if os.path.getsize(file_path) == 0:
            # A zero size already covers empty ``.txt`` files, so no separate
            # content check is needed for them.
            os.remove(file_path)
            print(f'Deleted empty file: {file_path}')
        elif filename.endswith('_links.txt'):
            os.remove(file_path)
            print(f'Deleted: {filename}')

def combine_files(dir_name):
    """Merge the JSON lists stored in ``dir_name``'s ``.txt`` files.

    Every ``.txt`` file directly in ``dir_name`` is parsed as JSON; files
    that cannot be read or parsed, or whose top-level value is not a list,
    are reported and skipped. The concatenated entries are written to
    ``combined.json`` in the same directory.

    Args:
        dir_name: Directory holding the ``.txt`` files.
    """
    combined_data = []

    # os.listdir order is arbitrary; sorting makes combined.json reproducible.
    for filename in sorted(os.listdir(dir_name)):
        if not filename.endswith('.txt'):
            continue
        txt_path = os.path.join(dir_name, filename)
        try:
            with open(txt_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"Skipping {txt_path}: {e}")
            continue
        if not isinstance(data, list):
            # extend() on a dict or string would silently add keys/characters.
            print(f"Skipping {txt_path}: top-level JSON value is not a list")
            continue
        combined_data.extend(data)

    with open(os.path.join(dir_name, 'combined.json'), 'w', encoding='utf-8') as outfile:
        json.dump(combined_data, outfile, ensure_ascii=False, indent=4)

def main(dir_name='SD', file_ext='.html'):
    """Scrape images referenced by the HTML files in ``dir_name``.

    For each file ending in ``file_ext`` that does not yet have a sibling
    entry named after it (without the extension), a directory of that name
    is created, the page's image URLs are written to ``<name>_links.txt``
    and the images are downloaded into the directory. Afterwards the
    directory is cleaned up and the per-page info files are merged by
    ``combine_files``.

    Args:
        dir_name: Directory containing the HTML files.
        file_ext: File-name suffix that selects the pages to process.
    """
    if not os.path.isdir(dir_name):
        print(f"Directory not found: {dir_name}")
        return

    html_files = [
        filename
        for filename in os.listdir(dir_name)
        if filename.endswith(file_ext)
        # An existing entry with the page's base name marks it as already processed.
        and not os.path.exists(os.path.join(dir_name, os.path.splitext(filename)[0]))
    ]
    for filename in tqdm.tqdm(html_files, desc="Processing HTML files", unit="file"):
        file_base_name = os.path.splitext(filename)[0]
        file_dir = os.path.join(dir_name, file_base_name)
        create_dir(file_dir)
        img_urls = parse_html(os.path.join(dir_name, filename))
        write_to_txt(f'{file_dir}_links', img_urls)
        download_images(file_dir, img_urls)
    clean_dir(dir_name)
    remove_empty_dirs(dir_name)
    delete_empty_files(dir_name)
    combine_files(dir_name)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()