
Website Translate + Save Title tag as html link

Sep 25th, 2021 (edited)
# For explanations, see:
#
# ENGLISH: https://neculaifantanaru.com/en/python-google-translate-beautifulsoup-library-save-title-tag-as-link.html
# ROMANIAN: https://neculaifantanaru.com/python-google-translate-beautifulsoup-library-save-title-tag-as-link.html

from bs4 import BeautifulSoup
from bs4.formatter import HTMLFormatter
import requests
import re
import execjs
from urllib import parse
import json

class Py4Js():

    def __init__(self):
        self.ctx = execjs.compile("""
        function TL(a) {
            var k = "";
            var b = 406644;
            var b1 = 3293161072;

            var jd = ".";
            var $b = "+-a^+6";
            var Zb = "+-3^+b+-f";

            for (var e = [], f = 0, g = 0; g < a.length; g++) {
                var m = a.charCodeAt(g);
                128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
                e[f++] = m >> 18 | 240,
                e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
                e[f++] = m >> 6 & 63 | 128),
                e[f++] = m & 63 | 128)
            }
            a = b;
            for (f = 0; f < e.length; f++) a += e[f],
            a = RL(a, $b);
            a = RL(a, Zb);
            a ^= b1 || 0;
            0 > a && (a = (a & 2147483647) + 2147483648);
            a %= 1E6;
            return a.toString() + jd + (a ^ b)
        };

        function RL(a, b) {
            var t = "a";
            var Yb = "+";
            for (var c = 0; c < b.length - 2; c += 3) {
                var d = b.charAt(c + 2),
                d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
                d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
                a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
            }
            return a
        }
        """)

    def getTk(self, text):
        return self.ctx.call("TL", text)

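# Note (added for clarity, not part of the original paste): getTk() reproduces the
# 'tk' token that Google's web translator sends with each request. Given the seeds
# b/b1 above, TL() returns two numbers joined by a dot, i.e. an "NNNNNN.NNNNNN"-shaped
# string. A minimal sketch of how it is used:
#
#   js = Py4Js()
#   token = js.getTk("Hello world")   # e.g. a string like "123456.654321" (value varies)
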
class Translate_as_google(object):
    def __init__(self, to_language, this_language='auto', read=False):
        '''
        to_language: the language to translate into
        this_language: the language of the source text, 'auto' by default
        read: whether to also generate a spoken (text-to-speech) file at a given location
        '''
        self.this_language = this_language
        self.to_language = to_language
        self.read = read

    def open_url(self, url):
        '''Send the HTTP request and return the response.'''
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = requests.get(url=url, headers=headers, timeout=8)

        return req

    def buildUrl(self):
        '''Build the request URL.
        sl: source language  tl: target language  q: the text to translate'''
        baseUrl = 'http://translate.google.cn/translate_a/single'
        baseUrl += '?client=webapp&'
        baseUrl += 'sl=%s&' % self.this_language
        baseUrl += 'tl=%s&' % self.to_language
        baseUrl += 'hl=zh-CN&'
        baseUrl += 'dt=at&'
        baseUrl += 'dt=bd&'
        baseUrl += 'dt=ex&'
        baseUrl += 'dt=ld&'
        baseUrl += 'dt=md&'
        baseUrl += 'dt=qca&'
        baseUrl += 'dt=rw&'
        baseUrl += 'dt=rm&'
        baseUrl += 'dt=ss&'
        baseUrl += 'dt=t&'
        baseUrl += 'ie=UTF-8&'
        baseUrl += 'oe=UTF-8&'
        baseUrl += 'clearbtn=1&'
        baseUrl += 'otf=1&'
        baseUrl += 'pc=1&'
        baseUrl += 'srcrom=0&'
        baseUrl += 'ssel=0&'
        baseUrl += 'tsel=0&'
        baseUrl += 'kc=2&'
        baseUrl += 'tk=' + str(self.tk) + '&'
        baseUrl += 'q=' + parse.quote(self.text)
        return baseUrl

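    # Illustrative only: with destination_language='fr' and text='Hello world',
    # buildUrl() concatenates the pieces above into a URL of the form
    #   http://translate.google.cn/translate_a/single?client=webapp&sl=auto&tl=fr&...&tk=<token>&q=Hello%20world
    # where <token> is the value computed by Py4Js.getTk().
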
    def read_go(self, args):
        '''Text-to-speech request.
        upload: path and file name to save the audio to
        return_language: language of the generated speech
        '''
        upload, return_language = args[0], args[1]
        read_translate_url = 'http://translate.google.cn/translate_tts?ie=UTF-8&q=%s&tl=%s&total=1&idx=0&textlen=3&tk=%s&client=webapp&prev=input' % (
            self.text, return_language, self.tk)
        data = self.open_url(read_translate_url)  # full response of the request
        with open(upload, 'wb') as f:
            f.write(data.content)

    def translate(self, text):
        '''Translate the given text and return the result.'''
        self.text = text
        js = Py4Js()
        self.tk = js.getTk(self.text)

        if len(self.text) > 4891:
            raise ValueError("The length of the text to translate exceeds the limit!")
        url = self.buildUrl()
        # print(url)
        _result = self.open_url(url)
        data = _result.content.decode('utf-8')

        tmp = json.loads(data)
        jsonArray = tmp[0]
        result = None
        for jsonItem in jsonArray:
            if jsonItem[0]:
                if result:
                    result = result + " " + jsonItem[0]
                else:
                    result = jsonItem[0]
        return result

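# Minimal standalone usage sketch (added for illustration; it assumes the
# translate.google.cn endpoint is still reachable, which is not guaranteed):
#
#   ts_demo = Translate_as_google('fr')
#   print(ts_demo.translate('Hello world'))   # prints the French translation
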
class UnsortedAttributes(HTMLFormatter):
    def attributes(self, tag):
        for k, v in tag.attrs.items():
            yield k, v

files_from_folder = r"c:\Folder1\translated\test"  # folder containing the .html files to translate

use_translate_folder = True  # write the output into a 'translated' sub-folder

destination_language = 'fr'

ts = Translate_as_google(destination_language)

extension_file = ".html"

import os

directory = os.fsencode(files_from_folder)

def recursively_translate(node):
    '''Translate every text node inside `node`, keeping the tag structure intact.'''
    for x in range(len(node.contents)):
        if isinstance(node.contents[x], str):
            if node.contents[x].strip() != '':
                try:
                    node.contents[x].replace_with(ts.translate(node.contents[x]))
                except Exception as e:
                    print(e)
        elif node.contents[x] is not None:
            recursively_translate(node.contents[x])

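# Illustrative example (not part of the original flow): only the text nodes are
# replaced, so inline markup survives. For instance:
#
#   demo = BeautifulSoup('<p>Hello <b>world</b></p>', 'html.parser')
#   recursively_translate(demo.p)
#   # demo.p now holds the translated text, with the <b> tag kept intact.
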
amount = 1
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    print(filename)
    if filename == 'y_key_e479323ce281e459.html' or filename == 'directory.html':
        continue  # skip files that must not be translated
    if filename.endswith(extension_file):
        with open(os.path.join(files_from_folder, filename), encoding='utf-8') as html:
            # wrap the file in <pre> so the original formatting survives parsing;
            # the wrapper is stripped again ([5:-6]) when the file is written out
            soup = BeautifulSoup('<pre>' + html.read() + '</pre>', 'html.parser')
            for title in soup.findAll('title'):
                print("Title content: ", title.get_text())
                recursively_translate(title)

            for meta in soup.findAll('meta', {'name':'description'}):
                try:
                    meta['content'] = ts.translate(meta['content'])
                except:
                    pass

            for h1 in soup.findAll('h1', {'itemprop':'name'}, class_='den_articol'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(h1)) < end_comment:
                    recursively_translate(h1)

            for p in soup.findAll('p', class_='text_obisnuit'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(p)) < end_comment:
                    recursively_translate(p)

            for p in soup.findAll('p', class_='text_obisnuit2'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(p)) < end_comment:
                    recursively_translate(p)

            for span in soup.findAll('span', class_='text_obisnuit2'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(span)) < end_comment:
                    recursively_translate(span)

            for li in soup.findAll('li', class_='text_obisnuit'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(li)) < end_comment:
                    recursively_translate(li)

            for a in soup.findAll('a', class_='linkMare'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(a)) < end_comment:
                    recursively_translate(a)

            for h4 in soup.findAll('h4', class_='text_obisnuit2'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(h4)) < end_comment:
                    recursively_translate(h4)

            for h5 in soup.findAll('h5', class_='text_obisnuit2'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(h5)) < end_comment:
                    recursively_translate(h5)

            for h1 in soup.findAll('h1', {'itemprop':'name'}, class_='den_webinar'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(h1)) < end_comment:
                    recursively_translate(h1)

        # Save the translated <title> text as the new HTML file name

        print(f'{filename} translated ({amount})')
        amount += 1
        soup = soup.encode(formatter=UnsortedAttributes()).decode('utf-8')  # serialize without re-sorting tag attributes
        new_filename = title.get_text()  # reuse the (already translated) <title> text
        new_filename = new_filename.lower()
        words = re.findall(r'\w+', new_filename)
        #words = re.findall(r'\b\w+\b(?=[\w\s]+\|)', new_filename)  # keeps only the words before a '|' separator, if the title contains one
        new_filename = '-'.join(words)
        new_filename = new_filename + '.html'
        print(new_filename)
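        # Example of the renaming above (hypothetical title): a translated <title> such as
        # "My Article Title | Example Site" becomes "my-article-title-example-site.html".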

        #print("Encode: ", new_filename.encode('ascii', 'ignore'))
        if use_translate_folder:
            try:
                with open(os.path.join(files_from_folder+r'\translated', new_filename), 'w', encoding='utf-8') as new_html:
                    new_html.write(soup[5:-6])  # drop the '<pre>'/'</pre>' wrapper added at parse time
            except FileNotFoundError:
                # the 'translated' sub-folder does not exist yet, so create it and retry
                os.mkdir(files_from_folder+r'\translated')
                with open(os.path.join(files_from_folder+r'\translated', new_filename), 'w', encoding='utf-8') as new_html:
                    new_html.write(soup[5:-6])
        else:
            with open(os.path.join(files_from_folder, new_filename), 'w', encoding='utf-8') as html:
                html.write(soup[5:-6])