Advertisement
nicuf

translate_python

Jan 2nd, 2022
202
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.14 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. from bs4.formatter import HTMLFormatter
  3. import requests
  4. import re
  5. #import execjs
  6. from urllib import parse
  7. import json
  8.  
  9. class Py4Js():
  10.  
  11. def __init__(self):
  12. self.ctx = execjs.compile("""
  13. function TL(a) {
  14. var k = "";
  15. var b = 406644;
  16. var b1 = 3293161072;
  17.  
  18. var jd = ".";
  19. var $b = "+-a^+6";
  20. var Zb = "+-3^+b+-f";
  21.  
  22. for (var e = [], f = 0, g = 0; g < a.length; g++) {
  23. var m = a.charCodeAt(g);
  24. 128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
  25. e[f++] = m >> 18 | 240,
  26. e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
  27. e[f++] = m >> 6 & 63 | 128),
  28. e[f++] = m & 63 | 128)
  29. }
  30. a = b;
  31. for (f = 0; f < e.length; f++) a += e[f],
  32. a = RL(a, $b);
  33. a = RL(a, Zb);
  34. a ^= b1 || 0;
  35. 0 > a && (a = (a & 2147483647) + 2147483648);
  36. a %= 1E6;
  37. return a.toString() + jd + (a ^ b)
  38. };
  39.  
  40. function RL(a, b) {
  41. var t = "a";
  42. var Yb = "+";
  43. for (var c = 0; c < b.length - 2; c += 3) {
  44. var d = b.charAt(c + 2),
  45. d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
  46. d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
  47. a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
  48. }
  49. return a
  50. }
  51. """)
  52.  
  53. def getTk(self, text):
  54. return self.ctx.call("TL", text)
  55.  
  56. class Translate_as_google(object):
  57. def __init__(self, to_language, this_language='auto', read=False):
  58. '''
  59. to_language:The language to be translated into
  60. this_language:The text to be converted, the default is auto
  61. read:Generate a text reading file at the specified location
  62. '''
  63. self.this_language = this_language
  64. self.to_language = to_language
  65. self.read = read
  66.  
  67. def open_url(self, url):
  68. '''请求'''
  69. headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
  70. req = requests.get(url=url, headers=headers , timeout=8)
  71.  
  72. return req
  73.  
  74. def buildUrl(self):
  75. '''封装请求url
  76. sl:The text to be converted tl:The result type of the conversion qThe text to be entered'''
  77. baseUrl = 'http://translate.google.cn/translate_a/single'
  78. baseUrl += '?client=webapp&'
  79. baseUrl += 'sl=%s&' % self.this_language
  80. baseUrl += 'tl=%s&' % self.to_language
  81. baseUrl += 'hl=zh-CN&'
  82. baseUrl += 'dt=at&'
  83. baseUrl += 'dt=bd&'
  84. baseUrl += 'dt=ex&'
  85. baseUrl += 'dt=ld&'
  86. baseUrl += 'dt=md&'
  87. baseUrl += 'dt=qca&'
  88. baseUrl += 'dt=rw&'
  89. baseUrl += 'dt=rm&'
  90. baseUrl += 'dt=ss&'
  91. baseUrl += 'dt=t&'
  92. baseUrl += 'ie=UTF-8&'
  93. baseUrl += 'oe=UTF-8&'
  94. baseUrl += 'clearbtn=1&'
  95. baseUrl += 'otf=1&'
  96. baseUrl += 'pc=1&'
  97. baseUrl += 'srcrom=0&'
  98. baseUrl += 'ssel=0&'
  99. baseUrl += 'tsel=0&'
  100. baseUrl += 'kc=2&'
  101. baseUrl += 'tk=' + str(self.tk) + '&'
  102. baseUrl += 'q=' + parse.quote(self.text)
  103. return baseUrl
  104.  
  105. def read_go(self, args):
  106. '''Speaking interception
  107. upload:Download to path and file name
  108. return_language:Language type returned
  109. '''
  110. upload, return_language = args[0], args[1]
  111. read_translate_url = 'http://translate.google.cn/translate_tts?ie=UTF-8&q=%s&tl=%s&total=1&idx=0&textlen=3&tk=%s&client=webapp&prev=input' % (
  112. self.text, return_language, self.tk)
  113. data = self.open_url(read_translate_url) #Return all data requested
  114. with open(upload, 'wb') as f:
  115. f.write(data.content)
  116.  
  117. def translate(self,text):
  118. '''Translation interception'''
  119. self.text = text
  120. js = Py4Js()
  121. self.tk = js.getTk(self.text)
  122.  
  123. if len(self.text) > 4891:
  124. raise ("The length of the translation exceeds the limit!!!")
  125. url = self.buildUrl()
  126. # print(url)
  127. _result = self.open_url(url)
  128. data = _result.content.decode('utf-8')
  129.  
  130. tmp = json.loads(data)
  131. jsonArray = tmp[0]
  132. result = None
  133. for jsonItem in jsonArray:
  134. if jsonItem[0]:
  135. if result:
  136. result = result + " " + jsonItem[0]
  137. else:
  138. result = jsonItem[0]
  139. return result
  140.  
  141. class UnsortedAttributes(HTMLFormatter):
  142. def attributes(self, tag):
  143. for k, v in tag.attrs.items():
  144. yield k, v
  145.  
# Folder holding the English source pages; the driver loop lists this folder
# and reads every matching file from it.
english_folder = r"c:\Folder1\5\en"

# Folder holding the French counterpart pages; they are read from here and
# the generated output is written under this folder as well.
french_folder = r"c:\Folder1\5\fr"

# Language code of the text being translated.
source_language = 'en'

# Language code the text is translated into.
destination_language = 'fr'

# Only files whose name ends with this suffix are processed.
extension_file = ".html"

# True: write results into the "parsed+translated" sub-folder of french_folder.
# False: write them directly into french_folder.
use_translate_folder = True

import os

# Byte-string form of the English folder; passed to os.listdir by the driver loop.
en_directory = os.fsencode(english_folder)
# Byte-string form of the French folder; not referenced elsewhere in this file.
fr_directory = os.fsencode(french_folder)
  164.  
  165. def recursively_translate(node):
  166. for x in range(len(node.contents)):
  167. if isinstance(node.contents[x], str):
  168. if node.contents[x].strip() != '':
  169. try:
  170. node.contents[x].replaceWith(translator.translate(node.contents[x], src=source_language, dest=destination_language).text)
  171. except:
  172. pass
  173. elif node.contents[x] != None:
  174. recursively_translate(node.contents[x])
  175.  
  176. print('Going through english folder')
  177. for file in os.listdir(en_directory):
  178. filename = os.fsdecode(file)
  179. print(filename)
  180. if filename == 'y_key_e479323ce281e459.html' or filename == 'directory.html':
  181. continue
  182. if filename.endswith(extension_file):
  183. with open(os.path.join(english_folder, filename), encoding='utf-8') as html:
  184. html = html.read()
  185. fr_file = re.search('/fr/(\S+)"', html)[1]
  186.  
  187. try:
  188. with open(os.path.join(french_folder, fr_file), encoding='utf-8') as fr_html:
  189. fr_html = fr_html.read()
  190.  
  191. title = re.search('<title.+/title>', html)[0]
  192. meta = re.search('<meta name="description".+/>', html)[0]
  193. comment_body = re.search('<!-- ARTICOL START -->.+<!-- ARTICOL FINAL -->', html, flags=re.DOTALL)[0]
  194.  
  195. fr_html = re.sub('<!-- ARTICOL START -->.+<!-- ARTICOL FINAL -->', comment_body, fr_html, flags=re.DOTALL)
  196. fr_html = re.sub('<meta name="description".+/>', meta, fr_html)
  197. fr_html = re.sub('<title.+/title>', title, fr_html)
  198. parsed_html = fr_html
  199.  
  200. soup = BeautifulSoup('<pre>' + fr_html + '</pre>', 'html.parser')
  201.  
  202. for title in soup.findAll('title'):
  203. recursively_translate(title)
  204.  
  205. for meta in soup.findAll('meta', {'name':'description'}):
  206. try:
  207. meta['content'] = translator.translate(meta['content'], src=source_language, dest=destination_language).text
  208. except:
  209. pass
  210.  
  211. for h1 in soup.findAll('h1', {'itemprop':'name'}, class_='den_articol'):
  212. begin_comment = str(soup).index('<!-- ARTICOL START -->')
  213. end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
  214. if begin_comment < str(soup).index(str(h1)) < end_comment:
  215. recursively_translate(h1)
  216.  
  217. for p in soup.findAll('p', class_='text_obisnuit'):
  218. begin_comment = str(soup).index('<!-- ARTICOL START -->')
  219. end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
  220. if begin_comment < str(soup).index(str(p)) < end_comment:
  221. recursively_translate(p)
  222.  
  223. for p in soup.findAll('p', class_='text_obisnuit2'):
  224. begin_comment = str(soup).index('<!-- ARTICOL START -->')
  225. end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
  226. if begin_comment < str(soup).index(str(p)) < end_comment:
  227. recursively_translate(p)
  228.  
  229. for span in soup.findAll('span', class_='text_obisnuit2'):
  230. begin_comment = str(soup).index('<!-- ARTICOL START -->')
  231. end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
  232. if begin_comment < str(soup).index(str(span)) < end_comment:
  233. recursively_translate(span)
  234.  
  235. for li in soup.findAll('li', class_='text_obisnuit'):
  236. begin_comment = str(soup).index('<!-- ARTICOL START -->')
  237. end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
  238. if begin_comment < str(soup).index(str(li)) < end_comment:
  239. recursively_translate(li)
  240.  
  241. for a in soup.findAll('a', class_='linkMare'):
  242. begin_comment = str(soup).index('<!-- ARTICOL START -->')
  243. end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
  244. if begin_comment < str(soup).index(str(a)) < end_comment:
  245. recursively_translate(a)
  246.  
  247. for h4 in soup.findAll('h4', class_='text_obisnuit2'):
  248. begin_comment = str(soup).index('<!-- ARTICOL START -->')
  249. end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
  250. if begin_comment < str(soup).index(str(h4)) < end_comment:
  251. recursively_translate(h4)
  252.  
  253. for h5 in soup.findAll('h5', class_='text_obisnuit2'):
  254. begin_comment = str(soup).index('<!-- ARTICOL START -->')
  255. end_comment = str(soup).index('<!-- ARTICOL FINAL -->')
  256. if begin_comment < str(soup).index(str(h5)) < end_comment:
  257. recursively_translate(h5)
  258. except FileNotFoundError:
  259. continue
  260.  
  261. print(f'{fr_file} parsed and translated')
  262. soup = soup.encode(formatter=UnsortedAttributes()).decode('utf-8')
  263. if use_translate_folder:
  264. try:
  265. with open(os.path.join(french_folder+r'\parsed+translated', 'parsed+translated_'+fr_file), 'w', encoding='utf-8') as new_html:
  266. new_html.write(soup[5:-6])
  267. except:
  268. os.mkdir(french_folder+r'\parsed+translated')
  269. with open(os.path.join(french_folder+r'\parsed+translated', 'parsed+translated_'+fr_file), 'w', encoding='utf-8') as new_html:
  270. new_html.write(soup[5:-6])
  271. else:
  272. with open(os.path.join(french_folder, 'parsed+translated_'+fr_file), 'w', encoding='utf-8') as html:
  273. html.write(soup[5:-6])
  274.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement