Advertisement
nicuf

new deep_translator

Dec 4th, 2023
603
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.00 KB | None | 0 0
  1. import os
  2. import re
  3. import textwrap
  4. from deep_translator import GoogleTranslator
  5.  
# Folder that is scanned for HTML files to translate. Because this is a raw
# string, the doubled backslashes are kept literally in the value.
folder_path = r"c:\\download\\myprotein\\extracted\\"
# Single translator instance shared by the whole script: the source language
# is auto-detected and the target language code is 'ro'.
translator = GoogleTranslator(source='auto', target='ro')
  8. tags_to_translate = [
  9.     r'(<title>)(.*?)(<\/title>)',
  10.     r'(<meta name="description" content=")(.*?)("\/>)',
  11.     r'(<div class="sc-jKDlA-D hSgfYV sc-glENfF hIVUeB">)(.*?)(<\/div>)',
  12.     r'(<p>)(.*?)(<\/p>)',
  13.     r'(<h4 class="sc-jMKfon fhunKk">)(.*?)(<\/h4>)',
  14.     r'(<h2">)(.*?)(<\/h2>)'
  15.     # ... alte tag-uri, structurate similar
  16. ]
  17.  
  18. def translate_in_parts(text, translator, max_length=4800):
  19.     parts = textwrap.wrap(text, max_length, break_long_words=False)
  20.     translated = ''
  21.     for part in parts:
  22.         if part:
  23.             translated += translator.translate(part)
  24.     return translated
  25.  
  26. for filename in os.listdir(folder_path):
  27.     if filename.endswith((".html", ".htm")):
  28.         with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
  29.             html_content = file.read()
  30.  
  31.         for tag in tags_to_translate:
  32.             matches = re.finditer(tag, html_content, re.DOTALL)
  33.  
  34.             for match in matches:
  35.                 full_match = match.group(0)
  36.                 tag_start = match.group(1)
  37.                 tag_content = match.group(2)
  38.                 tag_end = match.group(3)
  39.  
  40.                 # Traducem conținutul tag-ului în fragmente
  41.                 translated_content = translate_in_parts(tag_content, translator)
  42.                 translated_tag = f"{tag_start}{translated_content}{tag_end}"
  43.  
  44.                 html_content = html_content.replace(full_match, translated_tag)
  45.  
  46.         new_filename = f"{filename.split('.')[0]}_ro.html"
  47.         translated_folder_path = os.path.join(folder_path, 'translated')
  48.         if not os.path.exists(translated_folder_path):
  49.             os.mkdir(translated_folder_path)
  50.  
  51.         with open(os.path.join(translated_folder_path, new_filename), 'w', encoding='utf-8') as file:
  52.             file.write(html_content)
  53.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement