Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from bs4 import BeautifulSoup
- from collections import defaultdict
- from bs4.formatter import HTMLFormatter
- import requests
- import re
- import execjs
- from urllib import parse
- import json
- import os
- from unidecode import unidecode # Importă unidecode
def normalize_title(title):
    """Return ``title`` transliterated to ASCII and converted to title case.

    ``unidecode`` strips diacritics, then ``str.title()`` capitalises the first
    letter of every word.  ``str.title()`` treats an apostrophe as a word
    boundary, so "anna's" would come out as "Anna'S"; common contraction
    suffixes (``'s``, ``'t``, ``'d``, ``'m``, ``'re``, ``'ve``, ``'ll``) are
    therefore lower-cased again.  Names such as "O'Neil" are left untouched
    because the letters after the apostrophe do not form one of those suffixes.

    title: the text to normalise.
    """
    titled = unidecode(title).title()
    # Undo the capital that str.title() puts right after an apostrophe in
    # contractions and possessives ("Don'T" -> "Don't", "Anna'S" -> "Anna's").
    return re.sub(
        r"(?<=[A-Za-z])'(?:S|T|D|M|Re|Ve|Ll)\b",
        lambda suffix: suffix.group(0).lower(),
        titled,
    )
def get_file_hash(file_path):
    """Return the built-in ``hash()`` of a file's text, decoded as UTF-8.

    file_path: path of the file to read.

    The value comes from Python's ``hash()`` on a ``str``, so it is only
    meaningful for comparisons made within the same interpreter run.
    """
    with open(file_path, encoding='utf-8') as source:
        return hash(source.read())
def delete_duplicate_files(directory):
    """Delete files in ``directory`` whose content duplicates another file.

    Regular files directly inside ``directory`` are grouped by the value
    ``get_file_hash`` returns for them.  In every group with more than one
    member the first file is kept and the others are removed, printing one
    line per deleted file.

    directory: path of the folder to clean up (sub-folders are not visited).
    """
    files_by_hash = defaultdict(list)

    # os.listdir() returns names in arbitrary order; sorting makes the kept
    # copy predictable (the alphabetically first name of each group).
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if os.path.isfile(path):
            files_by_hash[get_file_hash(path)].append(path)

    for same_content in files_by_hash.values():
        # The slice is empty for groups of one, so nothing is removed there.
        for path in same_content[1:]:
            os.remove(path)
            print(f"Deleted duplicate file: {path}")
# Folder whose files are de-duplicated, rewritten and renamed by this script.
directory = "c:\\Folder-Oana\\extracted\\translated"
# Delete the files that share the same <title> and keep the largest variant.
def delete_duplicate_files_by_title(directory):
    """Keep only the largest file among those sharing a ``<title>`` text.

    Every regular file directly inside ``directory`` is read as UTF-8 and
    grouped by the text between its first ``<title>`` and ``</title>``.
    Files without such a tag are ignored.  For each title found in several
    files, the file with the greatest size on disk is kept and the rest are
    removed, printing one line per deleted file.

    directory: path of the folder to clean up.
    """
    paths_by_title = defaultdict(list)

    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)
        if not os.path.isfile(candidate):
            continue
        with open(candidate, 'r', encoding='utf-8') as handle:
            markup = handle.read()
        found = re.search(r'<title>(.*?)</title>', markup, re.DOTALL)
        if not found:
            continue
        paths_by_title[found.group(1)].append(candidate)

    for title, paths in paths_by_title.items():
        if len(paths) <= 1:
            continue
        keeper = max(paths, key=os.path.getsize)
        for path in paths:
            if path == keeper:
                continue
            os.remove(path)
            print(f"Deleted duplicate file: {path} with title: {title}")
# First pass: among files sharing a <title>, keep only the largest one.
delete_duplicate_files_by_title(directory)
# Table of (pattern, replacement, flags) triples.
# NOTE(review): nothing in the visible part of this script reads this list.
regex_and_replace = [
    # Text from a backslash (or double backslash) to the end of the string ...
    (r"\\\\.*$", "", 0),
    (r"\\.*\\$", "", 0),
    (r"\\.*$", "", 0),

    # ... and the same three patterns applied at the end of every line.
    (r"\\\\.*$", "", re.MULTILINE),
    (r"\\.*\\$", "", re.MULTILINE),
    (r"\\.*$", "", re.MULTILINE),

    # Invisible characters: U+200B, U+206A and U+206B.
    (r"\u200b", "", re.MULTILINE),
    (r"\u206A|\u206B", "", re.MULTILINE),

    # Markup repairs around <link rel="canonical"> and <p> tags.
    (r'html></p>', 'html" />', re.MULTILINE),
    (r'<p><link rel="canonical"', '<link rel="canonical"', 0),
    (r'<p><p class=', '<p class=', 0),
    (r"<p></p>", "", 0),

    # Add further patterns and replacements here.
]
# Second pass: rewrite every regular file in the directory with its <title>
# text normalised by normalize_title().
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    if not os.path.isfile(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        file_content = file.read()

    title_match = re.search(r'<title>(.*?)</title>', file_content, re.DOTALL)
    if title_match:
        title = title_match.group(1)
        normalized_title = normalize_title(title)
        # NOTE(review): str.replace() rewrites every occurrence of the title
        # text in the file, not only the one inside the <title> tag.
        file_content = file_content.replace(title, normalized_title)

    # Write the (possibly updated) content back to the same file.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(file_content)
    print(f"Procesat fișierul: {filename}")

# Third pass: remove files whose content is now identical to another file's.
delete_duplicate_files(directory)
class UnsortedAttributes(HTMLFormatter):
    """Formatter whose ``attributes`` hook yields a tag's attributes as stored.

    NOTE(review): presumably meant to stop attributes from being reordered on
    output; confirm against the Beautiful Soup formatter documentation.
    """

    def attributes(self, tag):
        """Yield every ``(name, value)`` pair of ``tag.attrs`` in dict order."""
        yield from tag.attrs.items()
def read_text_from_file(file_path):
    """Return the whole content of a file, decoded as UTF-8.

    file_path: path of the file to read.
    """
    with open(file_path, encoding='utf8') as source:
        return source.read()
def write_to_file(text, file_path, encoding='utf8'):
    """Write ``text`` to ``file_path``, replacing any existing content.

    text: the string to write.
    file_path: path of the file to create or overwrite.
    encoding: codec used to encode ``text`` (UTF-8 by default).

    The file is opened in binary mode, so line endings are written exactly as
    they appear in ``text``.  Characters the codec cannot represent are
    dropped silently (``errors='ignore'``).
    """
    with open(file_path, 'wb') as f:
        # Honour the caller's codec; it used to be hard-coded to UTF-8, which
        # made the ``encoding`` parameter a no-op.
        f.write(text.encode(encoding, 'ignore'))
# Final pass: rename every HTML page after a slug built from its <title> and
# point its canonical link at https://trinketbox.ro/<slug>.html.
extension_file = ".html"
# `directory` is rebound to bytes, so os.listdir() yields bytes names below.
directory = os.fsencode(directory)
directory_text = os.fsdecode(directory)
amount = 1

# Pages that must keep their current name.
skipped_files = ('y_key_e479323ce281e459.html', 'directory.html')
# The capture stops at the closing quote, so only the URL itself is replaced
# and the rest of the tag (quote, " />") stays intact.
canonical_pattern = re.compile(r'<link rel="canonical" href="(.*?)"')

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename in skipped_files or not filename.endswith(extension_file):
        continue

    current_file_name = os.path.join(directory_text, filename)
    # Read everything first and leave the `with` block, so the file is closed
    # before it is rewritten and renamed.
    with open(current_file_name, encoding='utf-8') as html:
        file_text = html.read()

    soup = BeautifulSoup('<pre>' + file_text + '</pre>', 'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        print(f'{filename} has no <title>; skipped')
        continue
    text_title = title_tag.get_text()

    # Build the slug: drop an apostrophe together with the character after it
    # ("anna's" -> "anna"), lower-case, and join the words with hyphens.
    words = re.findall(r'\w+', re.sub(r"'\w", '', text_title).lower())
    if not words:
        # A title without word characters would produce the name ".html".
        print(f'{filename} has a <title> without words; skipped')
        continue

    print(f'{filename} changed filename ({amount})')
    amount += 1

    # Transliterate the file name to remove diacritics.
    new_filename = unidecode('-'.join(words) + '.html')
    new_file_name = os.path.join(directory_text, new_filename)

    # A file that already carries its target name must not collide with
    # itself, otherwise it would gain a "_N" suffix on every run.
    already_named = (
        os.path.normcase(new_file_name) == os.path.normcase(current_file_name)
    )
    if not already_named and os.path.exists(new_file_name):
        # Another file owns the name: append the first free numeric suffix.
        base_name, extension = os.path.splitext(new_filename)
        suffix = 1
        while os.path.exists(new_file_name):
            new_filename = f"{base_name}_{suffix}{extension}"
            new_file_name = os.path.join(directory_text, new_filename)
            suffix += 1

    canonical = canonical_pattern.search(file_text)
    if canonical:
        # NOTE(review): the link is built from the un-transliterated words and
        # ignores any "_N" suffix, so it can differ from the final file name.
        link_nou = "https://trinketbox.ro/" + '-'.join(words) + ".html"
        old_link = canonical.group(1)
        if old_link:
            # Replace the old URL wherever it occurs in the page.
            file_text = file_text.replace(old_link, link_nou)
        else:
            # Empty href: str.replace('') would insert the URL between every
            # character, so patch the tag itself instead.
            file_text = (
                file_text[:canonical.start(1)]
                + link_nou
                + file_text[canonical.end(1):]
            )
        write_to_file(file_text, current_file_name)
    else:
        print("Nu am gasit tag-ul canonical in fisier")

    if new_file_name != current_file_name:
        os.rename(current_file_name, new_file_name)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement