nicuf

Python: Translate HTML tags from webpages with deep_translator (Google Translate)

Mar 10th, 2024
40
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.45 KB | None | 0 0
  1. import os
  2. import re
  3. import time
  4. from deep_translator import GoogleTranslator
  5. from deep_translator.exceptions import RequestError
  6. from dotenv import load_dotenv
  7.  
# Load environment variables from .env file
load_dotenv()

# Initialize the deep_translator GoogleTranslator: the source language is
# auto-detected and the target language is Japanese ('ja').
translator = GoogleTranslator(source='auto', target='ja')

# Initialize a counter for the translated files
translated_files_count = 0



# Folder path containing the HTML files.
# NOTE: this is a raw string, so every doubled backslash stays as two
# backslash characters in the resulting path.
folder_path = r"c:\\Folder-Oana\\extracted\\" # path of the original English files; do not change anything here




# Minimum file size in kilobytes (1.3 KB)
min_file_size_kb = 1.3
  27.  
  28. # Parcurgeți toate fișierele din director
  29. for filename in os.listdir(folder_path):
  30.     file_path = os.path.join(folder_path, filename)
  31.  
  32.     # Verificați dacă este un fișier și obțineți dimensiunea sa
  33.     if os.path.isfile(file_path):
  34.         file_size_kb = os.path.getsize(file_path) / 1024  # Dimensiunea fișierului în KB
  35.  
  36.         # Dacă fișierul este mai mic decât dimensiunea minimă, ștergeți-l
  37.         if file_size_kb < min_file_size_kb:
  38.             print(f"Șterg fișierul {filename} deoarece este mai mic decât 1.3 KB.")
  39.             os.remove(file_path)
  40.  
  41.         with open(file_path, 'rb') as file:
  42.             try:
  43.                 html_content = file.read().decode('utf-8')
  44.             except UnicodeDecodeError:
  45.                 # Încercați alte seturi de caractere, cum ar fi 'ISO-8859-1' sau 'latin-1'
  46.                 html_content = file.read().decode('ISO-8859-1')
  47.  
  48.  
  49.  
  50.  
# HTML tags to translate.
# Every pattern has exactly three capture groups: (1) the opening text that is
# kept as-is, (2) the text that is sent to the translator, (3) the closing
# text that is kept as-is. translate_html_tags() applies them with re.DOTALL.
# NOTE: for the <h1>-<h6> and "mb-35px" patterns group 1 stops before the end
# of the opening tag, so group 2 also contains the remaining attributes and
# the '>' character. For the meta description pattern group 2 runs up to the
# first '>' and therefore includes the closing quote of the attribute.
tags_to_translate = [
    r'(<title>)(.*?)(<\/title>)',
    r'(<meta name="description" content=")(.*?)(>)',
    r'(<h4)(.*?)(<\/h4>)',
    r'(<h5)(.*?)(<\/h5>)',
    r'(<h6)(.*?)(<\/h6>)',
    r'(<h3)(.*?)(<\/h3)',
    r'(<h1)(.*?)(<\/h1)',
    r'(<h2)(.*?)(<\/h2)',
    r'(<p class="text_obisnuit2">)(.*?)(<\/p>)',
    r'(<p class="text_obisnuit">)(.*?)(<\/p>)',
    r'(<p class="mb-35px)(.*?)(</p>)',
    # Add your own regex for <p class="book-description"> here
    r'(<p class="book-description">)(.*?)(<\/p>)',
    r'(html" class="color-grey">)(.*?)(</a>)'
]
  68.  
  69. def find_html_files_only_in_folder(folder, subfolder):
  70.     folder_files = set(os.listdir(folder))
  71.     subfolder_files = set(os.listdir(subfolder))
  72.  
  73.     html_files_only_in_folder = {f for f in folder_files - subfolder_files if f.endswith('.html')}
  74.  
  75.     return html_files_only_in_folder
  76.  
  77. def translate_in_parts(text, translator, max_length=4800, max_retries=5):
  78.     translated = ''
  79.     while text:
  80.         part = text[:max_length]
  81.         text = text[max_length:]
  82.  
  83.         attempt = 0
  84.         while attempt < max_retries:
  85.             try:
  86.                 translated_part = translator.translate(part)
  87.                 translated += translated_part
  88.                 break
  89.             except RequestError:
  90.                 attempt += 1
  91.                 time.sleep(1)
  92.                 print(f"Reîncercarea {attempt} pentru fragmentul: {part[:30]}...")
  93.     return translated
  94.  
  95. def apply_regex_before_translation(content):
  96.     # Adaugă regex-uri de căutare și înlocuire aici
  97.     # Exemplu: content = re.sub(r'pattern', 'replacement', content)
  98.  
  99.  
  100.  
  101.     content = re.sub(r'”></p>', r'">', content, flags=re.MULTILINE)
  102.     content = re.sub(r'<p><meta', r'<meta', content, flags=re.MULTILINE)
  103.  
  104.     # Remove paragraphs with less than 10 words (multi-line)
  105.     content = re.sub(r"<p>.{0,9}<\/p>", "", content, flags=re.MULTILINE)
  106.  
  107.     # Remove leading > from the first paragraph
  108.     content = re.sub(r"^>", "", content)
  109.  
  110.     # Remove U+200B non-breaking space (hair space)
  111.     content = re.sub(r"\x200B", "", content)
  112.     content = re.sub(r"\u00C2", "", content)
  113.     content = re.sub(r"\u001C", "", content)  # NSBN
  114.     content = re.sub(r"<p>\d+</p>", "", content)
  115.  
  116.     # Remove escaped characters
  117.     content = re.sub(r"\\\\.*$", "", content)
  118.  
  119.  
  120.  
  121.     content = re.sub(r'^\s', '', content, flags=re.MULTILINE)
  122.  
  123.  
  124.     content = re.sub(r'<p><title>', '<title>', content)
  125.     content = re.sub(r'<p></p>', '', content)
  126.     content = re.sub(r'</title></p>', '</title>', content)
  127.     content = re.sub(r'<p><meta name=', '<meta name=', content)
  128.     content = re.sub(r'"/></p>', '"/>', content)
  129.     content = re.sub(r'<p><p>', '<p>', content, flags=re.MULTILINE)
  130.     content = re.sub(r'</p></p>', '</p>', content, flags=re.MULTILINE)
  131.     content = re.sub(r'^</p>$', '', content, flags=re.MULTILINE)  # sterge orice <p> gol de la inceputul linilor
  132.     content = re.sub(r'<p style=".*?">', '<p>', content, flags=re.MULTILINE)
  133.     content = re.sub(r'<font.*?">', '', content, flags=re.MULTILINE)
  134.     content = re.sub(r'</font>', '', content, flags=re.MULTILINE)
  135.     content = re.sub(r'"\/"', '"', content, flags=re.MULTILINE)
  136.     content = re.sub(r'<strong>|</strong>', ' ', content, flags=re.MULTILINE)
  137.     content = re.sub(r'<p>\d+</p>', '', content, flags=re.MULTILINE)
  138.     content = re.sub(r'‘|’', '', content, flags=re.MULTILINE)
  139.     content = re.sub(r'”/></p>', '"/>', content, flags=re.MULTILINE)
  140.     content = re.sub(r'”|“', '', content, flags=re.MULTILINE)
  141.  
  142.     return content
  143.  
  144.  
  145.  
  146.  
  147. def translate_html_tags(file_path, translator, subfolder_path, ignored_files, translated_files):
  148.     global translated_tags_count
  149.     translated_tags_count = 0  # Reset the global counter for each file
  150.  
  151.     file_name = os.path.basename(file_path)
  152.  
  153.     if not file_name.endswith('.html'):
  154.         ignored_files.append(file_name)
  155.         print(f"Ignored file: {file_name}")
  156.         return None
  157.  
  158.     print(f"Translating file: {file_name}")  # Print the file being translated
  159.  
  160.     local_tag_count = 0  # Local counter for tags within this file
  161.  
  162.     translated_file_path = os.path.join(subfolder_path, file_name.rsplit('.', 1)[0] + '_ja.html')  # schimbi aici sufixul _ de final
  163.  
  164.     with open(file_path, 'r', encoding='utf-8') as file:
  165.         content = file.read()
  166.  
  167.     content = apply_regex_before_translation(content)
  168.  
  169.     for tag_regex in tags_to_translate:
  170.         matches = re.finditer(tag_regex, content, re.DOTALL)
  171.         for match in matches:
  172.             local_tag_count += 1  # Increment local tag counter
  173.             tag_start = match.group(1)
  174.             tag_content = match.group(2)
  175.             tag_end = match.group(3)
  176.  
  177.             print(f"Translating tag {local_tag_count}: {tag_start}...{tag_end}")
  178.  
  179.             translated_content = translate_in_parts(tag_content, translator)
  180.             translated_tag = f"{tag_start}{translated_content}{tag_end}"
  181.  
  182.             content = content.replace(f"{tag_start}{tag_content}{tag_end}", translated_tag)
  183.             translated_tags_count += 1
  184.  
  185.  
  186.  
  187.  
  188.     # Aplică adăugarea de taguri <p> pentru linii de text
  189.  
  190.  
  191.     # Aplică regex-uri DUPA TRADUCERE !!!!!!!
  192.  
  193.  
  194.     # content = remove_zwsp(content)
  195.     content = re.sub(r'<p></p>', '', content, flags=re.MULTILINE)
  196.     content = re.sub(r'<p>\d+</p>', '', content, flags=re.MULTILINE)
  197.     content = re.sub(r'”/></p>', '"/>', content, flags=re.MULTILINE)
  198.     content = re.sub(r'/></p>', '/>', content, flags=re.MULTILINE)
  199.     content = re.sub(r'»>|»\.>', '">', content, flags=re.MULTILINE)
  200.     content = re.sub(r'<p>                    <p', '<p', content, flags=re.MULTILINE)
  201.     content = re.sub(r'<p>        <p', '<p', content, flags=re.MULTILINE)
  202.  
  203.  
  204.  
  205.  
  206.  
  207.     with open(translated_file_path, 'w', encoding='utf-8') as file:
  208.         file.write(content)
  209.  
  210.     translated_files.append(translated_file_path)
  211.     print(f"Translated file: {translated_file_path}")
  212.     return translated_file_path
  213.  
  214.  
  215.  
  216.  
  217.  
  218.  
  219.  
  220.  
  221. if __name__ == "__main__":
  222.     folder_path = r"c:\Folder-Oana\extracted"
  223.     subfolder_path = r"c:\Folder-Oana\extracted\translated"  # schimbi aici folderul in care vrei sa puna traducerea
  224.  
  225.     ignored_files_list = []
  226.     translated_files_list = []
  227.  
  228.     # Asigură-te că subfolderul există
  229.     if not os.path.exists(subfolder_path):
  230.         os.makedirs(subfolder_path)
  231.  
  232.     html_files_to_translate = find_html_files_only_in_folder(folder_path, subfolder_path)
  233.     for file_name in html_files_to_translate:
  234.         translated_file = translate_html_tags(
  235.             os.path.join(folder_path, file_name),
  236.             translator,
  237.             subfolder_path,
  238.             ignored_files_list,
  239.             translated_files_list
  240.         )
  241.  
  242.     print(f"Ignored files: {ignored_files_list}")
  243.     print(f"Translated files: {translated_files_list}")
  244.     print(f"Total translated tags: {translated_tags_count}")
Add Comment
Please, Sign In to add comment