Advertisement
Not a member of Pastebin yet? Sign Up — it unlocks many cool features!
- from bs4 import BeautifulSoup
- from bs4.formatter import HTMLFormatter
- import requests
- import re
- #import execjs
- from urllib import parse
- import json
class Py4Js():
    """Pure-Python port of Google Translate's ``tk`` request-token algorithm.

    The original implementation compiled an embedded JavaScript snippet with
    ``execjs`` — but the ``import execjs`` line at the top of this file is
    commented out, so instantiating the class raised NameError, and execjs is
    a third-party dependency anyway.  This port reproduces the JS ``TL``/``RL``
    functions exactly, including their 32-bit wrap-around semantics, with no
    external dependency.  The public interface (``getTk``) is unchanged.
    """

    # Seed constants lifted verbatim from the original JS snippet.
    _B = 406644          # "b"  in TL()
    _B1 = 3293161072     # "b1" in TL()
    _MASK32 = 0xFFFFFFFF

    def __init__(self):
        # Nothing to compile any more; kept for interface compatibility.
        pass

    @staticmethod
    def _rl(a, seq):
        """Port of the JS ``RL(a, b)`` mixing routine.

        ``seq`` is consumed in triples (combine-op, shift-dir, amount):
        letters 'a'..'z' encode shift amounts 10..35 and digits encode
        themselves; '+' in the middle slot means an unsigned 32-bit right
        shift (JS ``>>>``), anything else a left shift; '+' in the first
        slot means 32-bit addition, anything else XOR.

        Values are kept congruent mod 2**32 (add masks, xor/shift-left do
        not need to); the caller applies a final mask to recover the exact
        JS uint32 result.
        """
        for i in range(0, len(seq) - 2, 3):
            ch = seq[i + 2]
            d = ord(ch) - 87 if ch >= 'a' else int(ch)
            # JS '>>>' operates on the value as an unsigned 32-bit int.
            d = (a & 0xFFFFFFFF) >> d if seq[i + 1] == '+' else a << d
            a = (a + d) & 0xFFFFFFFF if seq[i] == '+' else a ^ d
        return a

    def getTk(self, text):
        """Return the ``tk`` token for ``text``, e.g. ``'372634.236526'``.

        The JS code first hand-rolls a UTF-8 encoder (including UTF-16
        surrogate-pair handling); ``str.encode('utf-8')`` produces the same
        byte sequence, with ``surrogatepass`` matching the JS behaviour for
        lone surrogates.
        """
        e = text.encode('utf-8', errors='surrogatepass')
        a = self._B
        for byte in e:
            a = self._rl(a + byte, "+-a^+6")
        a = self._rl(a, "+-3^+b+-f")
        # JS: a ^= b1; if (a < 0) a = (a & 0x7FFFFFFF) + 0x80000000
        # i.e. reinterpret the signed int32 as uint32 — a plain mask here.
        a = (a ^ self._B1) & self._MASK32
        a %= 1000000
        return str(a) + "." + str(a ^ self._B)
class Translate_as_google(object):
    """Minimal client for the (legacy) Google Translate web endpoint.

    to_language:   language code to translate into (e.g. 'fr')
    this_language: language code of the source text; 'auto' to auto-detect
    read:          when truthy, read_go() may be used to save a TTS audio file
    """

    def __init__(self, to_language, this_language='auto', read=False):
        self.this_language = this_language
        self.to_language = to_language
        self.read = read

    def open_url(self, url):
        """GET ``url`` with a browser User-Agent and return the response."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        return requests.get(url=url, headers=headers, timeout=8)

    def buildUrl(self):
        """Assemble the translate_a/single request URL.

        Query parameters: sl = source language, tl = target language,
        tk = request token (set by translate()), q = URL-quoted text.
        """
        params = [
            'client=webapp',
            'sl=%s' % self.this_language,
            'tl=%s' % self.to_language,
            'hl=zh-CN',
            # Each dt= flag asks the endpoint for one response section.
            'dt=at', 'dt=bd', 'dt=ex', 'dt=ld', 'dt=md', 'dt=qca',
            'dt=rw', 'dt=rm', 'dt=ss', 'dt=t',
            'ie=UTF-8', 'oe=UTF-8',
            'clearbtn=1', 'otf=1', 'pc=1',
            'srcrom=0', 'ssel=0', 'tsel=0', 'kc=2',
            'tk=' + str(self.tk),
            'q=' + parse.quote(self.text),
        ]
        return 'http://translate.google.cn/translate_a/single?' + '&'.join(params)

    def read_go(self, args):
        """Fetch the TTS audio for the current text and save it to disk.

        args: (upload, return_language) — output file path and the language
        the audio should be spoken in.
        """
        upload, return_language = args[0], args[1]
        read_translate_url = 'http://translate.google.cn/translate_tts?ie=UTF-8&q=%s&tl=%s&total=1&idx=0&textlen=3&tk=%s&client=webapp&prev=input' % (
            self.text, return_language, self.tk)
        data = self.open_url(read_translate_url)  # full HTTP response
        with open(upload, 'wb') as f:
            f.write(data.content)

    def translate(self, text):
        """Translate ``text`` and return the joined result string.

        Raises ValueError when the text exceeds the endpoint's length limit.
        """
        self.text = text
        # Check the limit *before* doing any token work (the original
        # computed the token first, wasting it on oversized input).
        if len(self.text) > 4891:
            # BUG FIX: the original `raise ("...")` raised a plain string,
            # which is itself a TypeError in Python 3.
            raise ValueError("The length of the translation exceeds the limit!!!")
        js = Py4Js()
        self.tk = js.getTk(self.text)
        url = self.buildUrl()
        _result = self.open_url(url)
        data = _result.content.decode('utf-8')
        # Response shape: tmp[0] is a list of [translated, original, ...]
        # sentence fragments; join the non-empty translations with spaces.
        sentences = json.loads(data)[0]
        result = None
        for item in sentences:
            if item[0]:
                result = item[0] if result is None else result + " " + item[0]
        return result
class UnsortedAttributes(HTMLFormatter):
    """HTML formatter that emits tag attributes in their original order.

    BeautifulSoup's default formatter sorts attributes alphabetically on
    output; yielding ``tag.attrs`` untouched preserves the order in which
    the attributes were parsed.
    """

    def attributes(self, tag):
        # dict preserves insertion order, i.e. the source document's order.
        yield from tag.attrs.items()
# --- Folder / language configuration -------------------------------------
# Path to english folder
english_folder = r"c:\Folder1\5\en"
# Path to french folder
french_folder = r"c:\Folder1\5\fr"
source_language = 'en'
destination_language = 'fr'
# Only files with this extension are processed by the main loop below.
extension_file = ".html"
# When True, output is written to a 'parsed+translated' subfolder of
# french_folder instead of french_folder itself.
use_translate_folder = True
import os
# fsencode the paths so os.listdir yields bytes names (decoded per file
# with os.fsdecode in the main loop).
en_directory = os.fsencode(english_folder)
fr_directory = os.fsencode(french_folder)  # NOTE(review): unused in the visible code
def recursively_translate(node):
    """Translate every non-empty text child of ``node`` in place.

    Walks the bs4 tree depth-first: plain text children are replaced with
    their translation, element children are recursed into.  Translation
    failures are deliberately swallowed so one bad string cannot abort the
    whole page (best-effort behaviour, preserved from the original).

    NOTE(review): ``translator`` is not defined anywhere in this file — it is
    presumably a googletrans ``Translator`` instance created elsewhere;
    confirm before running.
    """
    for i, child in enumerate(node.contents):
        if isinstance(child, str):
            # Skip whitespace-only text nodes.
            if child.strip() != '':
                try:
                    node.contents[i].replaceWith(
                        translator.translate(
                            child,
                            src=source_language,
                            dest=destination_language,
                        ).text)
                except Exception:
                    # BUG FIX: was a bare `except:`, which also swallows
                    # KeyboardInterrupt/SystemExit.  Still best-effort.
                    pass
        elif child is not None:
            recursively_translate(child)
# --- Main pass: walk every English page and refresh its French twin ------
print('Going through english folder')
for file in os.listdir(en_directory):
    filename = os.fsdecode(file)  # listdir on a bytes path yields bytes names
    print(filename)
    # Skip the site-verification file and the index page.
    if filename == 'y_key_e479323ce281e459.html' or filename == 'directory.html':
        continue
    if filename.endswith(extension_file):
        # Read the English page; the handle name is rebound to its text.
        with open(os.path.join(english_folder, filename), encoding='utf-8') as html:
            html = html.read()
        # The English page links to its French counterpart as /fr/<name>".
        fr_file = re.search('/fr/(\S+)"', html)[1]
        try:
            # May raise FileNotFoundError when no French twin exists yet —
            # handled below by skipping the file.
            with open(os.path.join(french_folder, fr_file), encoding='utf-8') as fr_html:
                fr_html = fr_html.read()
            # Lift <title>, the description <meta> and the article body
            # (between the ARTICOL marker comments) from the English page...
            title = re.search('<title.+/title>', html)[0]
            meta = re.search('<meta name="description".+/>', html)[0]
            comment_body = re.search('<!-- ARTICOL START -->.+<!-- ARTICOL FINAL -->', html, flags=re.DOTALL)[0]
            # ...and splice them into the French page in place of its own.
            fr_html = re.sub('<!-- ARTICOL START -->.+<!-- ARTICOL FINAL -->', comment_body, fr_html, flags=re.DOTALL)
            fr_html = re.sub('<meta name="description".+/>', meta, fr_html)
            fr_html = re.sub('<title.+/title>', title, fr_html)
            parsed_html = fr_html  # NOTE(review): never read afterwards
            # Wrap in <pre> so whitespace survives parsing; the wrapper is
            # stripped again via soup[5:-6] before writing out.
            soup = BeautifulSoup('<pre>' + fr_html + '</pre>', 'html.parser')
            # Translate the page title(s).
            for title in soup.findAll('title'):
                recursively_translate(title)
            # Translate the meta description's content attribute
            # (best-effort; failures leave the original text in place).
            for meta in soup.findAll('meta', {'name':'description'}):
                try:
                    meta['content'] = translator.translate(meta['content'], src=source_language, dest=destination_language).text
                except:
                    pass
            # For every translatable element kind, translate only the
            # occurrences that sit between the ARTICOL START/FINAL markers
            # (position is checked against the serialized soup each time).
            for h1 in soup.findAll('h1', {'itemprop':'name'}, class_='den_articol'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(h1)) < end_comment:
                    recursively_translate(h1)
            for p in soup.findAll('p', class_='text_obisnuit'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(p)) < end_comment:
                    recursively_translate(p)
            for p in soup.findAll('p', class_='text_obisnuit2'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(p)) < end_comment:
                    recursively_translate(p)
            for span in soup.findAll('span', class_='text_obisnuit2'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(span)) < end_comment:
                    recursively_translate(span)
            for li in soup.findAll('li', class_='text_obisnuit'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(li)) < end_comment:
                    recursively_translate(li)
            for a in soup.findAll('a', class_='linkMare'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(a)) < end_comment:
                    recursively_translate(a)
            for h4 in soup.findAll('h4', class_='text_obisnuit2'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(h4)) < end_comment:
                    recursively_translate(h4)
            for h5 in soup.findAll('h5', class_='text_obisnuit2'):
                begin_comment = str(soup).index('<!-- ARTICOL START -->')
                end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
                if begin_comment < str(soup).index(str(h5)) < end_comment:
                    recursively_translate(h5)
        except FileNotFoundError:
            continue
        print(f'{fr_file} parsed and translated')
        # Serialize with original attribute order; decode back to str.
        soup = soup.encode(formatter=UnsortedAttributes()).decode('utf-8')
        if use_translate_folder:
            try:
                # soup[5:-6] strips the '<pre>'/'</pre>' wrapper added above.
                with open(os.path.join(french_folder+r'\parsed+translated', 'parsed+translated_'+fr_file), 'w', encoding='utf-8') as new_html:
                    new_html.write(soup[5:-6])
            except:
                # Output folder missing — create it, then retry the write.
                os.mkdir(french_folder+r'\parsed+translated')
                with open(os.path.join(french_folder+r'\parsed+translated', 'parsed+translated_'+fr_file), 'w', encoding='utf-8') as new_html:
                    new_html.write(soup[5:-6])
        else:
            with open(os.path.join(french_folder, 'parsed+translated_'+fr_file), 'w', encoding='utf-8') as html:
                html.write(soup[5:-6])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement