nicuf

Translate website with GoogleTrans

Apr 29th, 2023
148
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 12.80 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. from bs4.formatter import HTMLFormatter
  3. from googletrans import Translator
  4. import requests
  5. import re
  6.  
  7. translator = Translator()
  8.  
  class UnsortedAttributes(HTMLFormatter):
      """Formatter that emits a tag's attributes in the order they are stored.

      ``attributes`` hands back every ``(name, value)`` pair of ``tag.attrs``
      as-is, without sorting or filtering.
      """

      def attributes(self, tag):
          """Yield the ``(name, value)`` pairs of ``tag.attrs`` in dict order."""
          yield from tag.attrs.items()
  13.  
  # Folder that is scanned for input pages; output paths are derived from it.
  # This is a raw string, so backslashes must NOT be doubled here (doubling
  # them puts two separators in a row into the path).
  files_from_folder = r"c:\Folder3\translated"

  # True: translated pages go into a "translated" sub-folder of
  # files_from_folder.  False: they are written next to the source pages.
  use_translate_folder = True

  # Language code passed as ``dest`` to the translator and appended to the
  # name of every output file.
  destination_language = 'ro'

  # Only files whose name ends with this suffix are processed.
  extension_file = ".html"
  # Matches a <meta name="description" content="..."/> tag on a single line
  # (greedy: up to the last "/>" on that line).  An emptied word-list group,
  # "(\b()\b.*){0,}", used to sit in the middle; it could always match zero
  # times, so it accepted exactly the same text while adding nested-quantifier
  # backtracking on lines that almost match.
  pattern0 = r'<meta name="description" content=".*/>'
  22. #pattern1 = r'<p class="text_obisnuit">.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</p>'
  23. #pattern2 = r'<p class="text_obisnuit2">.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</p>'
  24. #pattern3 = r'<title>.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</title>'
  25. #pattern4 = r'<meta name="description" content=.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){0,}.*>'
  26. #pattern5 = r'<li class=.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</li>'
  27. #pattern6 = r'<p class="alertHd">.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</p>'
  28.  
  # Spanish words used to recognise text that still needs translating: a
  # <p class="mb-40px"> paragraph is selected when the word group below can be
  # matched twice inside it.  Order and duplicates are kept as they were.
  _pattern7_words = (
      'que', 'vista', 'porque', 'aquí', 'tiene', 'esto', 'dos', 'uno',
      'tres', 'cuatro', 'a la', 'las', 'están', 'cinco', 'seis', 'siete',
      'diez', 'tenía', 'luego', 've', 'vio', 'también', 'que', 'que',
      'debe', 'hacer', 'otro', 'obtiene', 'hará', 'hará', 'hecho', 'suyo',
      'puede', 'puede', 'parecer', 'para', 'mientras', 'que', 'estos',
      'dejen', 'preguntar', 'como', 'ganado', 'guardar', 'pero', 'todo',
      'sin', 'pensar', 'sobre', 'solo', 'para', 'cada', 'intentar', 'soy',
      'ellos', 'uno', 'más', 'mucho', 'hoy', 'queda', 'como', 'los',
      'puede', 'haber',
  )
  pattern7 = (
      r'<p class="mb-40px">.*(\b('
      + '|'.join(_pattern7_words)
      + r')\b.*){2,2}.*</p>'
  )
  30. #pattern8 = r'class="color-black">.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){0,}.*</a></h3>'
  31. #pattern8 = r'<h3\x20.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</h3>'
  32.  
  33. '''
  34. # LIMBA ROMANA
  35.  
  36. pattern0 = r'<h1\x20.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</h1>'
  37. pattern1 = r'<p class="text_obisnuit">.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</p>'
  38. pattern2 = r'<p class="text_obisnuit2">.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</p>'
  39. pattern3 = r'<title>.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</title>'
  40. pattern4 = r'<meta name="description" content=.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*>'
  41. pattern5 = r'<li class=.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</li>'
  42. pattern6 = r'<p class="alertHd">.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</p>'
  43.  
  44. pattern7 = r'<p class="mb-40px">.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</p>'
  45. pattern8 = r'<h3\x20.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</h3>'
  46. '''
  47. # PATTERNS
  48.  
  # Patterns applied to every page.  [pattern0] (meta description only) is the
  # alternative selection; it used to be assigned here and overwritten on the
  # very next line, so it is kept as a comment instead of a dead store.
  #patterns = [pattern0]
  patterns = [pattern7]
  51. #patterns = [pattern0, pattern1, pattern2, pattern3, pattern4, pattern5, pattern6, pattern7, pattern8]
  52. import os
  53.  
  54. directory = os.fsencode(files_from_folder)
  55.  
  def recursively_translate(node):
      """Translate every non-blank text child of ``node`` in place, depth first.

      String children are sent to the module-level ``translator`` with
      ``dest=destination_language`` and replaced by the returned text; every
      other non-None child is processed recursively.

      Translation is best effort: an exception raised while handling one
      string is printed and that string is left unchanged, so the remaining
      children are still processed.

      Args:
          node: object exposing a ``contents`` sequence (in this script, a
              BeautifulSoup document or tag).

      Returns:
          None. The tree is modified in place.
      """
      # NOTE(review): bs4 comment/doctype nodes are str subclasses as well, so
      # they would be translated here too - confirm that this is intended.
      # Iterate over a snapshot: replace_with() edits node.contents as we go.
      for child in list(node.contents):
          if isinstance(child, str):
              if not child.strip():
                  continue  # whitespace-only text: nothing to translate
              try:
                  translation = translator.translate(child, dest=destination_language).text
                  # replace_with is the current bs4 spelling; replaceWith is
                  # only a deprecated alias.
                  child.replace_with(translation)
              except Exception as e:
                  print(e)
          elif child is not None:
              recursively_translate(child)
  67.  
  # Translate every eligible page of the input folder and save the result.
  # Pages listed here are never processed.
  _SKIPPED_FILES = ('y_key_e479323ce281e459.html', 'directory.html')

  for file in os.listdir(directory):
      filename = os.fsdecode(file)
      print(filename)
      if filename in _SKIPPED_FILES or not filename.endswith(extension_file):
          continue

      with open(os.path.join(files_from_folder, filename), encoding='utf-8') as html:
          page = html.read()

      updated = False
      for pattern in patterns:
          # finditer keeps scanning the string it was given, so rebinding
          # ``page`` inside the loop does not disturb the iteration.
          for match in re.finditer(pattern, page):
              updated = True
              fragment = match.group(0)
              soup = BeautifulSoup(fragment, 'html.parser')
              if pattern != pattern0:
                  recursively_translate(soup)
              else:
                  # pattern0 selects the description <meta>; its text lives in
                  # the ``content`` attribute, not in child nodes.
                  meta = soup.find('meta')
                  if meta is not None and meta.get('content'):
                      try:
                          meta['content'] = translator.translate(
                              meta['content'], dest=destination_language).text
                      except Exception as e:
                          # Best effort, like the text-node translation: report
                          # the failure and keep the original attribute.
                          print(e)
              translated = soup.encode(formatter=UnsortedAttributes()).decode('utf-8')
              page = page.replace(fragment, translated)

      if not updated:
          continue

      print(f'{filename} translated')
      # splitext keeps dots inside the stem ("a.b.html" -> "a.b"), whereas
      # split(".")[0] would cut the name down to "a".
      stem = os.path.splitext(filename)[0]
      new_filename = f'{stem}_{destination_language}.html'
      if use_translate_folder:
          output_folder = os.path.join(files_from_folder, 'translated')
          # Create the folder up front instead of treating any write failure
          # as "folder missing".
          os.makedirs(output_folder, exist_ok=True)
      else:
          output_folder = files_from_folder
      with open(os.path.join(output_folder, new_filename), 'w', encoding='utf-8') as new_html:
          new_html.write(page)
  103.  
Add Comment
Please, Sign In to add comment