Translate HTML paragraphs with googletrans, by keyword tags

Apr 29th, 2023
# ROMANIAN: https://neculaifantanaru.com/regex-python-translate-beautifulsoup-googletrans-html-tags-contains-keywords.html

# ENGLISH: https://neculaifantanaru.com/en/regex-python-translate-beautifulsoup-googletrans-html-tags-contains-keywords.html

'''
# pip install googletrans==4.0.0rc1

def read_file(_path):
    with open(_path, "r", encoding="utf-8") as f:
        _html_str = f.read()
        return _html_str


def write_file(_path, _str):
    with open(_path, "w", encoding="utf-8") as f:
        f.write(_str)
'''

import os

from bs4 import BeautifulSoup
from bs4.formatter import HTMLFormatter
from googletrans import Translator

translator = Translator()

# Emit attributes in the order they appear in the source HTML instead of
# letting BeautifulSoup sort them alphabetically on output.
class UnsortedAttributes(HTMLFormatter):
    def attributes(self, tag):
        for k, v in tag.attrs.items():
            yield k, v

files_from_folder = r"c:\Folder3"

use_translate_folder = True

destination_language = 'ro'

extension_file = ".html"

directory = os.fsencode(files_from_folder)

# Translate every non-empty text node inside a tag, recursing into nested
# tags so inline markup (<em>, <a>, ...) keeps its structure.
def recursively_translate(node):
    for x in range(len(node.contents)):
        if isinstance(node.contents[x], str):
            if node.contents[x].strip() != '':
                try:
                    node.contents[x].replace_with(translator.translate(node.contents[x], dest=destination_language).text)
                except Exception:
                    pass
        elif node.contents[x] is not None:
            recursively_translate(node.contents[x])

amount = 1
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    print(filename)
    if filename == 'y_key_e479323ce281e459.html' or filename == 'directory.html':
        continue
    if filename.endswith(extension_file):
        with open(os.path.join(files_from_folder, filename), encoding='utf-8') as html:
            # The <pre> wrapper preserves the original whitespace; it is stripped off again before writing.
            soup = BeautifulSoup('<pre>' + html.read() + '</pre>', 'html.parser')

            # Only tags located between these two HTML comments are translated.
            # Both markers must exist in the file, otherwise index() raises ValueError.
            begin_comment = str(soup).index('<!-- post -->')
            end_comment = str(soup).index('<!-- ARTICOL START -->')

            for title in soup.findAll('title'):
                recursively_translate(title)

            for meta in soup.findAll('meta', {'name':'description'}):
                try:
                    meta['content'] = translator.translate(meta['content'], dest=destination_language).text
                except Exception:
                    pass

            for h1 in soup.findAll('h1', {'itemprop':'name'}, class_='den_articol'):
                if begin_comment < str(soup).index(str(h1)) < end_comment:
                    recursively_translate(h1)

            for p in soup.findAll('p', class_='text_obisnuit'):
                if begin_comment < str(soup).index(str(p)) < end_comment:
                    recursively_translate(p)

            # These paragraphs are translated wherever they appear in the page.
            for p in soup.findAll('p', class_='text_obisnuit2'):
                recursively_translate(p)

            for p in soup.findAll('p', class_='NOU'):
                recursively_translate(p)

            for span in soup.findAll('span', class_='text_obisnuit2'):
                if begin_comment < str(soup).index(str(span)) < end_comment:
                    recursively_translate(span)

            for li in soup.findAll('li', class_='text_obisnuit'):
                if begin_comment < str(soup).index(str(li)) < end_comment:
                    recursively_translate(li)

            for a in soup.findAll('a', class_='linkMare'):
                if begin_comment < str(soup).index(str(a)) < end_comment:
                    recursively_translate(a)

            for h4 in soup.findAll('h4', class_='text_obisnuit2'):
                if begin_comment < str(soup).index(str(h4)) < end_comment:
                    recursively_translate(h4)

            for h5 in soup.findAll('h5', class_='text_obisnuit2'):
                if begin_comment < str(soup).index(str(h5)) < end_comment:
                    recursively_translate(h5)

            for h1 in soup.findAll('h1', {'itemprop':'name'}, class_='den_webinar'):
                if begin_comment < str(soup).index(str(h1)) < end_comment:
                    recursively_translate(h1)

            for h3 in soup.findAll('h3', class_='font-weight-normal'):
                if begin_comment < str(soup).index(str(h3)) < end_comment:
                    recursively_translate(h3)

            for span in soup.findAll('span', class_='online'):
                if begin_comment < str(soup).index(str(span)) < end_comment:
                    recursively_translate(span)

            for p in soup.findAll('p', class_='mb-40px'):
                if begin_comment < str(soup).index(str(p)) < end_comment:
                    recursively_translate(p)

            for p in soup.findAll('p', class_='mb-35px color-grey line-height-25px'):
                if begin_comment < str(soup).index(str(p)) < end_comment:
                    recursively_translate(p)

        print(f'{filename} translated ({amount})')
        amount += 1
        # Re-serialize without re-ordering attributes, then drop the <pre>...</pre> wrapper ([5:-6]).
        soup = soup.encode(formatter=UnsortedAttributes()).decode('utf-8')
        new_filename = f'{filename.split(".")[0]}_{destination_language}.html'
        if use_translate_folder:
            try:
                with open(os.path.join(files_from_folder + r'\translated', new_filename), 'w', encoding='utf-8') as new_html:
                    new_html.write(soup[5:-6])
            except FileNotFoundError:
                os.mkdir(files_from_folder + r'\translated')
                with open(os.path.join(files_from_folder + r'\translated', new_filename), 'w', encoding='utf-8') as new_html:
                    new_html.write(soup[5:-6])
        else:
            with open(os.path.join(files_from_folder, new_filename), 'w', encoding='utf-8') as html:
                html.write(soup[5:-6])
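
# For reference, below is a minimal, offline sketch of the position check the
# loops above depend on: a tag is translated only if its serialized position
# falls between the '<!-- post -->' and '<!-- ARTICOL START -->' comments.
# The sample HTML and its class names are illustrative assumptions modeled on
# the selectors used in the script, not content from a real page, and instead
# of calling googletrans it only prints which paragraphs would be translated.

from bs4 import BeautifulSoup

sample = """
<title>Demo page</title>
<!-- post -->
<h1 itemprop="name" class="den_articol">Article title</h1>
<p class="text_obisnuit">This paragraph sits between the markers.</p>
<!-- ARTICOL START -->
<p class="text_obisnuit">This paragraph sits after the end marker.</p>
"""

soup = BeautifulSoup(sample, 'html.parser')
begin_comment = str(soup).index('<!-- post -->')
end_comment = str(soup).index('<!-- ARTICOL START -->')

for p in soup.findAll('p', class_='text_obisnuit'):
    # Same position test the main script applies to each matched tag.
    inside = begin_comment < str(soup).index(str(p)) < end_comment
    print(p.get_text(strip=True), '->', 'would be translated' if inside else 'skipped')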