Advertisement
Guest User

1_scraptHtml.py

a guest
Dec 2nd, 2023
19
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.13 KB | Source Code | 0 0
  1. # colorama==0.4.6 numpy==1.26.1 piexif==1.1.3 Pillow==10.1.0 python-dateutil==2.8.2 pytz==2023.3.post1 six==1.16.0 tqdm==4.66.1 tzdata==2023.3 ...
  2. import os, json, urllib.request, piexif, shutil, glob, imghdr, json, hashlib, time, socket, tqdm
  3. from urllib.parse import urlparse
  4. from html.parser import HTMLParser
  5. from PIL import Image, UnidentifiedImageError
  6.  
  7. class ImgParser(HTMLParser):
  8.     def __init__(self):
  9.         super().__init__()
  10.         self.img_urls = []
  11.  
  12.     def handle_starttag(self, tag, attrs):
  13.         if tag == 'img':
  14.             for attr in attrs:
  15.                 if attr[0] == 'src':
  16.                     self.img_urls.append(attr[1])
  17.  
  18. def parse_html(file_path):
  19.     parser = ImgParser()
  20.     try:
  21.         with open(file_path, 'r', encoding='utf-8') as f:
  22.             parser.feed(f.read())
  23.     except PermissionError:
  24.         print(f"Permission denied: {file_path}")
  25.         return []
  26.     return parser.img_urls
  27.  
  28.  
  29. def write_to_txt(file_path, img_urls):
  30.     with open(f'{file_path}.txt', 'w') as f:
  31.         for url in img_urls:
  32.             f.write(f'{url}\n')
  33.  
  34. def create_dir(folder_path):
  35.     if not os.path.exists(folder_path):
  36.         os.makedirs(folder_path)
  37.  
  38. def read_info_from_image(image_path):
  39.     try:
  40.         image = Image.open(image_path)
  41.     except UnidentifiedImageError:
  42.         print(f"Cannot identify image file {image_path}")
  43.         return None, None
  44.  
  45.     items = image.info or {}
  46.     geninfo = items.pop('parameters', None)
  47.  
  48.     if "exif" in items:
  49.         exif = piexif.load(items["exif"])
  50.         exif_comment = (exif or {}).get("Exif", {}).get(piexif.ExifIFD.UserComment, b'')
  51.         try:
  52.             exif_comment = exif_comment.decode(errors="ignore")
  53.         except ValueError:
  54.             exif_comment = exif_comment.decode('utf8', errors="ignore")
  55.  
  56.         if exif_comment:
  57.             items['exif comment'] = exif_comment
  58.             geninfo = exif_comment
  59.  
  60.     for field in ['jfif', 'jfif_version', 'jfif_unit', 'jfif_density', 'dpi', 'exif',
  61.                     'loop', 'background', 'timestamp', 'duration', 'progressive', 'progression',
  62.                     'icc_profile', 'chromaticity']:
  63.         items.pop(field, None)
  64.  
  65.     if items.get("Software", None) == "NovelAI":
  66.         try:
  67.             json_info = json.loads(items["Comment"])
  68.             sampler = sd_samplers.samplers_map.get(json_info["sampler"], "Euler a")
  69.  
  70.             geninfo = f"""{items["Description"]}
  71. Negative prompt: {json_info["uc"]}
  72. Steps: {json_info["steps"]}, Sampler: {sampler}, CFG scale: {json_info["scale"]}, Seed: {json_info["seed"]}, Size: {image.width}x{image.height}, Clip skip: 2, ENSD: 31337"""
  73.         except Exception:
  74.             errors.report("Error parsing NovelAI image generation parameters", exc_info=True)
  75.  
  76.     return geninfo, items
  77.  
  78. def is_connected():
  79.     try:
  80.         # connect to the host -- tells us if the host is actually reachable
  81.         socket.create_connection(("www.google.com", 80))
  82.         return True
  83.     except OSError:
  84.         pass
  85.     return False
  86.  
  87. def download_images(file_path, img_urls):
  88.     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537'}
  89.     allowed_extensions = ['.jpg', '.png', '.webp', '.jpeg']
  90.     image_info_list = []
  91.  
  92.     for i, img_url in enumerate(tqdm.tqdm(img_urls, desc="Downloading images", unit="image")):
  93.         while True:  # Keep trying until successful download or HTTP error
  94.             if is_connected():
  95.                 try:
  96.                     parsed_url = urlparse(img_url)
  97.                     img_file_name = os.path.basename(parsed_url.path)
  98.                     img_file_name = hashlib.md5(img_file_name.encode()).hexdigest() + os.path.splitext(img_file_name)[1]
  99.                     req = urllib.request.Request(img_url, headers=headers)
  100.                     img_data = urllib.request.urlopen(req).read()
  101.                     img_file_path = os.path.join(file_path, img_file_name)
  102.                     with open(img_file_path, 'wb') as handler:
  103.                         handler.write(img_data)
  104.                     try:
  105.                         info, items = read_info_from_image(img_file_path)
  106.                         if info is not None:
  107.                             image_info_list.append({
  108.                                 "instruction": img_file_path,
  109.                                 "input": "",
  110.                                 "output": info
  111.                             })
  112.                     except Exception as e:  # Catch all exceptions
  113.                         print(f"Error reading image {img_file_path}: {e}")
  114.                 except Exception as e:  # Catch all exceptions during download
  115.                     print(f"Error downloading image {img_url}: {e}. Retrying...")
  116.                 break  # Successfully downloaded the image and read its info, break the loop
  117.             else:
  118.                 print("Internet connection not available. Retrying in 5 seconds...")
  119.                 time.sleep(5)
  120.     with open(os.path.join(os.path.dirname(file_path), f'{os.path.basename(file_path)}_info.txt'), 'w', encoding='utf-8') as info_file:
  121.         json.dump(image_info_list, info_file, ensure_ascii=False, indent=4)
  122.     return image_info_list
  123.  
  124.  
  125.  
  126. def clean_dir(dir_name):
  127.     # Get all .txt files in the directory
  128.     txt_files = glob.glob(os.path.join(dir_name, "*.txt"))
  129.    
  130.     for txt_file in txt_files:
  131.         with open(txt_file, 'r', encoding='utf-8') as f:  # Specify 'utf-8' encoding
  132.             content = f.read()
  133.             # If content is '[]', clear the file
  134.             if content.strip() == '[]':
  135.                 open(txt_file, 'w').close()
  136.  
  137. def remove_empty_dirs(dir_name):
  138.     for subdir in os.listdir(dir_name):
  139.         subdir_path = os.path.join(dir_name, subdir)
  140.         if os.path.isdir(subdir_path):  # Check if it's a directory
  141.             files = os.listdir(subdir_path)
  142.             if len(files) == 0:  # If directory is empty
  143.                 os.rmdir(subdir_path)  # Remove the directory
  144.  
  145. def delete_empty_files(dir_name):
  146.     for filename in os.listdir(dir_name):
  147.         file_path = os.path.join(dir_name, filename)
  148.         if os.path.isfile(file_path):
  149.             # Check if file is empty
  150.             if os.path.getsize(file_path) == 0:
  151.                 os.remove(file_path)
  152.                 print(f'Deleted empty file: {file_path}')
  153.             # Check if .txt file is empty
  154.             elif filename.endswith('.txt'):
  155.                 with open(file_path, 'rb') as f:
  156.                     if not f.read(1):
  157.                         os.remove(file_path)
  158.                         print(f'Deleted empty .txt file: {file_path}')
  159.     for filename in os.listdir(dir_name):
  160.         if filename.endswith('.txt'):
  161.             if filename.endswith('_links.txt'):
  162.                 os.remove(os.path.join(dir_name, filename))
  163.                 print(f'Deleted: {filename}')
  164.  
  165. def combine_files(dir_name):
  166.     combined_data = []
  167.  
  168.     for filename in os.listdir(dir_name):
  169.         if filename.endswith('.txt'):
  170.             with open(os.path.join(dir_name, filename), 'r', encoding='utf-8') as f:
  171.                 data = json.load(f)
  172.                 combined_data.extend(data)
  173.  
  174.     with open(os.path.join(dir_name, 'combined.json'), 'w', encoding='utf-8') as outfile:
  175.         json.dump(combined_data, outfile, ensure_ascii=False, indent=4)
  176.  
  177. def main():
  178.     dir_name = 'SD'
  179.     file_ext = '.html'
  180.     html_files = [filename for filename in os.listdir(dir_name) if filename.endswith(file_ext) and not os.path.exists(os.path.join(dir_name, os.path.splitext(filename)[0]))]
  181.     for i, filename in enumerate(tqdm.tqdm(html_files, desc="Processing HTML files", unit="file")):
  182.         file_base_name = os.path.splitext(filename)[0]
  183.         file_dir = os.path.join(dir_name, file_base_name)
  184.         create_dir(file_dir)
  185.         img_urls = parse_html(os.path.join(dir_name, filename))
  186.         write_to_txt(f'{file_dir}_links', img_urls)
  187.         image_info_list = download_images(file_dir, img_urls)
  188.     clean_dir(dir_name)
  189.     remove_empty_dirs(dir_name)
  190.     delete_empty_files(dir_name)
  191.     combine_files(dir_name)
  192.  
  193. if __name__ == "__main__":
  194.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement