Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from bs4 import BeautifulSoup
- from bs4.formatter import HTMLFormatter
- from googletrans import Translator
- import requests
- import re
# Single googletrans client shared by every translate() call in this script.
translator = Translator()
class UnsortedAttributes(HTMLFormatter):
    """Formatter that emits a tag's attributes in the order they are stored.

    Passed as ``formatter=`` when a parsed fragment is serialised back to
    text, so the attribute order of the rewritten HTML follows ``tag.attrs``.
    """

    def attributes(self, tag):
        """Yield every ``(name, value)`` pair of ``tag.attrs`` without reordering."""
        yield from tag.attrs.items()
# Folder that is scanned for pages to translate. The string is raw, so each
# path separator is written as a single backslash.
files_from_folder = r"c:\Folder3\translated"
# When true, translated pages are written to a "translated" sub-folder of
# files_from_folder; otherwise they are written next to the source pages.
use_translate_folder = True
# Language code handed to the translator as ``dest`` and appended to the
# name of every output file.
destination_language = 'ro'
# Only file names ending with this suffix are processed.
extension_file = ".html"
# Matches a self-closing description <meta> tag that sits on a single line,
# from the opening '<meta' up to the last '/>' on that line.
# No stop-word group of the form (\b(w1|w2)\b.*){n,} is used here: with an
# empty word list and an unbounded repeat such a group never changes what is
# matched, but it makes lines that lack a closing '/>' backtrack exponentially.
pattern0 = r'<meta name="description" content=".*/>'
- #pattern1 = r'<p class="text_obisnuit">.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</p>'
- #pattern2 = r'<p class="text_obisnuit2">.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</p>'
- #pattern3 = r'<title>.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</title>'
- #pattern4 = r'<meta name="description" content=.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){0,}.*>'
- #pattern5 = r'<li class=.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</li>'
- #pattern6 = r'<p class="alertHd">.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</p>'
# Spanish words and phrases looked for, as whole words, inside the paragraph.
# Repeated entries are kept so the assembled expression stays unchanged.
_pattern7_words = (
    'que', 'vista', 'porque', 'aquí', 'tiene', 'esto', 'dos', 'uno', 'tres',
    'cuatro', 'a la', 'las', 'están', 'cinco', 'seis', 'siete', 'diez',
    'tenía', 'luego', 've', 'vio', 'también', 'que', 'que', 'debe', 'hacer',
    'otro', 'obtiene', 'hará', 'hará', 'hecho', 'suyo', 'puede', 'puede',
    'parecer', 'para', 'mientras', 'que', 'estos', 'dejen', 'preguntar',
    'como', 'ganado', 'guardar', 'pero', 'todo', 'sin', 'pensar', 'sobre',
    'solo', 'para', 'cada', 'intentar', 'soy', 'ellos', 'uno', 'más', 'mucho',
    'hoy', 'queda', 'como', 'los', 'puede', 'haber',
)
# Matches a single-line <p class="mb-40px">...</p> element whose text holds
# at least two occurrences of the words above.
pattern7 = (
    r'<p class="mb-40px">.*(\b('
    + '|'.join(_pattern7_words)
    + r')\b.*){2,2}.*</p>'
)
- #pattern8 = r'class="color-black">.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){0,}.*</a></h3>'
- #pattern8 = r'<h3\x20.*(\b(the|you|which|view|because|here|have|this|two|one|three|four|five|six|seven|ten|had|then|see|saw|also|than|that|must|make|from|else|does|get|will|make|made|yours|can|your|doesn|their|could|from|at|of|my|an|by|with|his|him|she|he|it|may|seem|and|for|else|while|which|these|let|ask|has|as|won|keep|but|everything|without|thinking|about|just|to|doesn|if|each|try|I’m|them|one|more|much|on|all|even|over|seems|was|where|were|who|our|most|cause|be)\b.*){10,10}.*</h3>'
- '''
- # LIMBA ROMANA
- pattern0 = r'<h1\x20.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</h1>'
- pattern1 = r'<p class="text_obisnuit">.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</p>'
- pattern2 = r'<p class="text_obisnuit2">.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</p>'
- pattern3 = r'<title>.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</title>'
- pattern4 = r'<meta name="description" content=.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*>'
- pattern5 = r'<li class=.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</li>'
- pattern6 = r'<p class="alertHd">.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</p>'
- pattern7 = r'<p class="mb-40px">.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</p>'
- pattern8 = r'<h3\x20.*(\b(in|la|unei|si|se|de|prin|unde|care|al|prea|lui|din|ai|unui|acei|doar|tine|ale|sau|dintre|intre|cu|ce|va|fi|este|cand|o|cine|aceasta|ca|dar|iar|fara|asta|pe|tu|nu|mai|ne|le|intr-o|cum|esti|intr-un|altfel|obtine|facut|ta|voastra|putea|meu|sunt|el|ea|poate|parea|pentru|altceva|ceva|timp|acestia|pastreze|totul|daca|fiecare|ei|unul|mult|pe|toate|chiar|peste|pare|fost|noi|cel|voi|vezi|aici|au|acest|doi|unu|trei|patru|cinci|sase|sapte|zece|avut|avea|avand|stie|stia|atunci|vazut|vad|asemenea|decat|aceea|trebuie|faca|face|facand|numai|mergem|merge|mearga|duce)\b.*){4,}.*</h3>'
- '''
# PATTERNS
# Only the patterns in this list are applied to each page. pattern0 (the
# description <meta> tag) can be added to the list to have it translated too.
patterns = [pattern7]

import os

# The input folder as a bytes path: os.listdir() on it returns bytes names,
# which the processing loop turns back into text with os.fsdecode().
directory = os.fsencode(files_from_folder)
def recursively_translate(node):
    """Translate, in place, every non-blank text child found under *node*.

    Children of ``node.contents`` that are strings are sent to the shared
    ``translator`` (target language: ``destination_language``) and replaced
    by the translated text; whitespace-only strings are left alone. Every
    other child is descended into recursively.

    A failed translation is printed and the original text is kept, so one
    bad request does not stop the rest of the tree from being processed.

    Args:
        node: An object exposing a ``contents`` list -- presumably a bs4
            ``BeautifulSoup`` or ``Tag`` instance; verify against callers.

    Returns:
        None. The tree is modified in place.
    """
    # Walk a snapshot: replace_with() edits node.contents during the loop.
    for child in list(node.contents):
        if isinstance(child, str):
            if child.strip():
                try:
                    translation = translator.translate(child, dest=destination_language).text
                    child.replace_with(translation)
                except Exception as e:
                    # Best effort: report the failure and keep the original text.
                    print(e)
        elif child is not None:
            recursively_translate(child)
# Exact file names that are never translated.
_SKIPPED_FILES = {'y_key_e479323ce281e459.html', 'directory.html'}


def _translate_fragment(fragment, pattern):
    """Return the matched HTML *fragment* with its text translated.

    For pattern0 only the ``content`` attribute of the <meta> tag is
    translated; for every other pattern all text nodes are translated through
    recursively_translate(). A failed <meta> translation is printed and the
    attribute is left unchanged, mirroring recursively_translate().
    """
    soup = BeautifulSoup(fragment, 'html.parser')
    if pattern != pattern0:
        recursively_translate(soup)
    else:
        meta = soup.find('meta')
        try:
            meta['content'] = translator.translate(meta['content'], dest=destination_language).text
        except Exception as e:
            print(e)
    # UnsortedAttributes serialises attributes in the order they are stored.
    return soup.encode(formatter=UnsortedAttributes()).decode('utf-8')


# Translate every matching page of the input folder and write the result to
# "<name>_<destination_language>.html".
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    print(filename)
    if filename in _SKIPPED_FILES or not filename.endswith(extension_file):
        continue

    with open(os.path.join(files_from_folder, filename), encoding='utf-8') as html:
        page = html.read()

    updated = False
    for pattern in patterns:
        # finditer() keeps scanning the text it was given, so rebinding
        # `page` inside the loop does not disturb the iteration.
        for match in re.finditer(pattern, page):
            updated = True
            fragment = match.group(0)
            page = page.replace(fragment, _translate_fragment(fragment, pattern))
    if not updated:
        continue

    print(f'{filename} translated')
    # splitext() drops only the final extension, so dots inside the name
    # survive in the output file name.
    new_filename = f'{os.path.splitext(filename)[0]}_{destination_language}.html'
    if use_translate_folder:
        output_folder = os.path.join(files_from_folder, 'translated')
        # Create the folder up front rather than reacting to a failed open().
        os.makedirs(output_folder, exist_ok=True)
    else:
        output_folder = files_from_folder
    with open(os.path.join(output_folder, new_filename), 'w', encoding='utf-8') as new_html:
        new_html.write(page)
Add Comment
Please, Sign In to add comment