Python - HTML save <title> as link

from bs4 import BeautifulSoup
from collections import defaultdict
from bs4.formatter import HTMLFormatter
import requests
import re
import execjs
from urllib import parse
import json
import os
from unidecode import unidecode  # Importă unidecode

def normalize_title(title):
    """Return *title* transliterated to ASCII with each word capitalized.

    title: the raw title text.

    Diacritics are removed with ``unidecode``; every run of ASCII letters is
    then capitalized (first letter upper-case, the rest lower-case).
    """
    ascii_title = unidecode(title)
    # str.title() treats an apostrophe as a word boundary, so "it's" became
    # "It'S". Capitalize whole words instead, keeping an apostrophe suffix
    # attached to the word it belongs to.
    return re.sub(
        r"[A-Za-z]+('[A-Za-z]+)?",
        lambda word: word.group(0).capitalize(),
        ascii_title,
    )

def get_file_hash(file_path):
    """Return an integer fingerprint of the text content of *file_path*.

    file_path: path of a UTF-8 text file.

    The file is read as text (so newline differences are normalized exactly
    as before) and fingerprinted with SHA-256. The built-in ``hash()`` used
    previously is salted per interpreter run and only 64 bits wide, which is
    too weak a basis for deciding that two files are duplicates and deleting
    one of them.

    Raises UnicodeDecodeError if the file is not valid UTF-8.
    """
    import hashlib

    with open(file_path, encoding='utf-8') as file:
        file_content = file.read()
    digest = hashlib.sha256(file_content.encode('utf-8')).digest()
    # Callers use the result as a dict key; keep returning an int.
    return int.from_bytes(digest, 'big')

def delete_duplicate_files(directory):
    """Delete files in *directory* whose content hash matches an earlier file.

    directory: path of the folder to scan (sub-directories are ignored).

    Files are bucketed by ``get_file_hash``; within each bucket the first
    file encountered is kept and every later one is removed, with a message
    printed per deletion.
    """
    paths_by_hash = defaultdict(list)

    # Bucket every regular file under its content hash.
    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)
        if not os.path.isfile(candidate):
            continue
        paths_by_hash[get_file_hash(candidate)].append(candidate)

    # The slice is empty for single-file buckets, so nothing is removed there.
    for same_content in paths_by_hash.values():
        for file in same_content[1:]:
            os.remove(file)
            print(f"Deleted duplicate file: {file}")


# The directory in which the regex search-and-replace is to be applied
directory = "c:\\Folder-Oana\\extracted\\translated"

# Delete the files that share the same TITLE and keep the file whose size is the largest

def delete_duplicate_files_by_title(directory):
    """Keep one file per distinct <title> text, deleting the smaller copies.

    directory: path of the folder to scan (sub-directories are ignored).

    Every regular file is read as UTF-8 and grouped by the text of its first
    ``<title>...</title>`` pair; files without one are left alone. Within a
    group of two or more, the largest file on disk survives and each of the
    others is removed, with a message printed per deletion.
    """
    title_pattern = re.compile(r'<title>(.*?)</title>', re.DOTALL)
    paths_by_title = defaultdict(list)

    # Bucket the files under the exact text of their title.
    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)
        if not os.path.isfile(candidate):
            continue
        with open(candidate, 'r', encoding='utf-8') as handle:
            found = title_pattern.search(handle.read())
        if found is None:
            continue
        paths_by_title[found.group(1)].append(candidate)

    # Within each bucket, everything except the largest file goes.
    for title, paths in paths_by_title.items():
        if len(paths) < 2:
            continue
        keeper = max(paths, key=os.path.getsize)
        for file in (path for path in paths if path != keeper):
            os.remove(file)
            print(f"Deleted duplicate file: {file} with title: {title}")

# Call the function to delete duplicate files by title.
# This runs as soon as the module is executed, before the title
# normalization loop further down rewrites any file.
delete_duplicate_files_by_title(directory)


# The list of regexes and their corresponding replacements.
# Each entry looks like (pattern, replacement, flags) — presumably the
# arguments for an re.sub call; TODO confirm against the code that uses it.
# NOTE(review): nothing in the visible part of this file reads this list.
regex_and_replace = [
    # Remove text running from a backslash (or a double backslash) to the end
    # anchor "$": first with no flags, then again with re.MULTILINE so "$"
    # matches at the end of every line.
    (r"\\\\.*$", "", 0),
    (r"\\.*\\$", "", 0),
    (r"\\.*$", "", 0),
    (r"\\\\.*$", "", re.MULTILINE),
    (r"\\.*\\$", "", re.MULTILINE),
    (r"\\.*$", "", re.MULTILINE),
    # Remove the zero-width space U+200B and the characters U+206A / U+206B.
    (r"\u200b", "", re.MULTILINE),
    (r"\u206A|\u206B", "", re.MULTILINE),
    # Where 'html>' is directly followed by '</p>', put back a closing quote
    # and a self-closing ' />'.
    (r'html></p>', 'html" />', re.MULTILINE),
    # Drop a '<p>' that was opened directly in front of a canonical link or
    # another '<p class=' tag, and remove empty paragraphs.
    (r'<p><link rel="canonical"', '<link rel="canonical"', 0),
    (r'<p><p class=', '<p class=', 0),
    (r"<p></p>", "", 0),
    # Add other regexes and replacements here
]

# Go through every file in the directory and normalize the text of its
# <title> element.
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)

    # Only regular files are processed; sub-directories are skipped.
    if not os.path.isfile(file_path):
        continue

    # Read the whole file and close it before it is rewritten below.
    with open(file_path, 'r', encoding='utf-8') as file:
        file_content = file.read()

    # Find the content of the first <title> tag (it may span several lines).
    title_match = re.search(r'<title>(.*?)</title>', file_content, re.DOTALL)
    if title_match:
        title = title_match.group(1)

        # Normalize the title
        normalized_title = normalize_title(title)

        # Splice the normalized text into the matched <title> span only.
        # A plain str.replace(title, ...) would also rewrite every other
        # occurrence of the same text in the document body.
        file_content = (
            file_content[:title_match.start(1)]
            + normalized_title
            + file_content[title_match.end(1):]
        )

        # Write the updated content back to the file
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(file_content)

    print(f"Procesat fișierul: {filename}")

# Call the function to delete duplicate files.
# Runs after the title normalization above, so files are compared by the
# content they have once their titles have been rewritten.
delete_duplicate_files(directory)


class UnsortedAttributes(HTMLFormatter):
    """HTMLFormatter whose attribute hook applies no reordering."""

    def attributes(self, tag):
        """Yield the (name, value) pairs of *tag* in the order tag.attrs holds them."""
        yield from tag.attrs.items()


def read_text_from_file(file_path):
    """
    Return the entire content of a file, decoded as UTF-8.
    file_path: path of the file to read from
    """
    with open(file_path, encoding='utf8') as source:
        return source.read()

def write_to_file(text, file_path, encoding='utf8'):
    """
    Write a text to a file, replacing any previous content.
    text: the text to write
    file_path: path of the file to write to
    encoding: codec used to encode the text (default UTF-8)

    Characters the codec cannot represent are dropped silently ('ignore').
    The file is opened in binary mode, so newlines are written exactly as
    they appear in *text*.
    """
    # The encoding argument used to be ignored in favour of a hard-coded
    # 'utf-8'; the default keeps existing callers' output unchanged.
    with open(file_path, 'wb') as f:
        f.write(text.encode(encoding, 'ignore'))

# directory = "c:\\Folder3\\translated"

# Only file names ending with this suffix are handled by the rename loop below.
extension_file = ".html"

# Rebind `directory` to its bytes form; os.listdir() on a bytes path returns
# bytes names, which the loop below decodes again with os.fsdecode().
directory = os.fsencode(directory)

# Counter shown in the progress message of the rename loop, starting at 1.
amount = 1
# Rename every HTML file after its <title> and rewrite its canonical link.
directory_path = os.fsdecode(directory)  # str form, whether `directory` is bytes or str
canonical_pattern = re.compile('<link rel="canonical" href="(.*?)>')

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    # These two files are never renamed.
    if filename in ('y_key_e479323ce281e459.html', 'directory.html'):
        continue
    if not filename.endswith(extension_file):
        continue

    current_file_name = os.path.join(directory_path, filename)

    # Read the whole file and close it before it is rewritten or renamed.
    with open(current_file_name, encoding='utf-8') as html:
        file_text = html.read()

    soup = BeautifulSoup('<pre>' + file_text + '</pre>', 'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        # Indexing the empty result list used to raise IndexError and abort
        # the whole run on the first file without a title.
        print(f'{filename} has no <title> tag, skipped')
        continue
    text_title = title_tag.get_text()

    # Build the slug: drop an apostrophe together with the character after
    # it (the "'s" of possessives), lower-case, join the words with "-".
    new_filename = re.sub(r"'\w", '', text_title).lower()
    words = re.findall(r'\w+', new_filename)
    if not words:
        # A title without any word character would produce the name ".html".
        print(f'{filename} has an empty title, skipped')
        continue

    print(f'{filename} changed filename ({amount})')
    amount += 1

    # Transliterate the file name to remove diacritics.
    new_filename = unidecode('-'.join(words) + '.html')
    new_file_name = os.path.join(directory_path, new_filename)

    # Append _1, _2, ... while the target name belongs to a DIFFERENT file.
    # A file that already carries its target name is not a collision;
    # without the samefile() check it was renamed to "<name>_1.html" on
    # every run.
    base_name, extension = os.path.splitext(new_filename)
    suffix = 1
    while (os.path.exists(new_file_name)
           and not os.path.samefile(new_file_name, current_file_name)):
        new_filename = f"{base_name}_{suffix}{extension}"
        new_file_name = os.path.join(directory_path, new_filename)
        suffix += 1

    canonical_match = canonical_pattern.search(file_text)
    if canonical_match:
        # NOTE(review): the group runs up to the tag's closing ">", so it
        # also captures the closing quote (and any " /") after the URL, and
        # the replacement below drops them. The 'html></p>' entry in
        # regex_and_replace looks like the matching repair step — confirm
        # before tightening this pattern.
        canonical = canonical_match.group(1)
        # Build the link from the final file name so that it includes the
        # transliteration and any collision suffix applied above.
        link_nou = "https://trinketbox.ro/" + new_filename
        file_text = file_text.replace(canonical, link_nou)
        write_to_file(file_text, current_file_name)
    else:
        print("Nu am gasit tag-ul canonical in fisier")

    # Replace the file name (nothing to do when it is already correct).
    if new_file_name != current_file_name:
        os.rename(current_file_name, new_file_name)