Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# -----------
# EXPLANATION:
# ROMANIAN: https://neculaifantanaru.com/example-python-google-translate-any-text-html-file-version-2.html
# ENGLISH: https://neculaifantanaru.com/en/example-python-google-translate-any-text-html-file-version-2.html
# -----------
# NOTE: a "translated" sub-folder is required inside the input directory (output files are written there).
# -*- encoding: utf-8 -*-
'''
@File : google_trans.py
@Time : 2020/5/15 9:29
@Author : hxluo
@Version : 1.0
@Contact : 465801795@qq.com
@Desc : google translate
'''
- # import lib
- from bs4 import BeautifulSoup
- from urllib import parse
- import re
- import os
- from bs4.formatter import HTMLFormatter
- import requests
- import execjs
- import json
- import random
- import unidecode
- import nltk
- from nltk import tokenize
- # nltk.download('punkt')
class Py4Js():
    """Run the embedded JavaScript ``TL`` function through execjs.

    ``getTk`` returns the value that ``TL`` produces for a piece of text;
    the translator class puts that value in the ``tk`` query parameter.
    """

    # JavaScript source handed to execjs. It defines TL(a), which folds the
    # UTF-8 bytes of `a` through RL() and returns a "<number>.<number>"
    # string, and the helper RL(a, b).
    _JS_SOURCE = """
    function TL(a) {
        var k = "";
        var b = 406644;
        var b1 = 3293161072;
        var jd = ".";
        var $b = "+-a^+6";
        var Zb = "+-3^+b+-f";
        for (var e = [], f = 0, g = 0; g < a.length; g++) {
            var m = a.charCodeAt(g);
            128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
            e[f++] = m >> 18 | 240,
            e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
            e[f++] = m >> 6 & 63 | 128),
            e[f++] = m & 63 | 128)
        }
        a = b;
        for (f = 0; f < e.length; f++) a += e[f],
        a = RL(a, $b);
        a = RL(a, Zb);
        a ^= b1 || 0;
        0 > a && (a = (a & 2147483647) + 2147483648);
        a %= 1E6;
        return a.toString() + jd + (a ^ b)
    };
    function RL(a, b) {
        var t = "a";
        var Yb = "+";
        for (var c = 0; c < b.length - 2; c += 3) {
            var d = b.charAt(c + 2),
            d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
            d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
            a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
        }
        return a
    }
    """

    # Compiled execjs context, shared by every instance so the script is
    # compiled once per process rather than once per Py4Js() call.
    _compiled = None

    def __init__(self):
        """Expose the (lazily compiled, shared) execjs context as ``self.ctx``."""
        if Py4Js._compiled is None:
            Py4Js._compiled = execjs.compile(Py4Js._JS_SOURCE)
        self.ctx = Py4Js._compiled

    def getTk(self, text):
        """Return the result of calling the JavaScript ``TL`` function on ``text``."""
        return self.ctx.call("TL", text)
class Translate_as_google(object):
    """Translate text by calling the translate.google.cn web endpoints."""

    # Longest text (in characters) that translate() accepts in one call.
    MAX_TEXT_LENGTH = 4891

    def __init__(self, to_language, this_language='auto', read=False):
        '''
        to_language: the language to translate into
        this_language: the language of the input text, 'auto' by default
        read: stored on the instance as ``self.read``; not used by any
              method of this class
        '''
        self.this_language = this_language
        self.to_language = to_language
        self.read = read
        # Token generator, created on first use and then reused so the
        # JavaScript is not set up again for every translate() call.
        self._js = None

    def open_url(self, url):
        '''Send a GET request to ``url`` and return the requests response.'''
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        return requests.get(url=url, headers=headers, timeout=8)

    def buildUrl(self):
        '''Build the translation request URL.

        Reads ``self.this_language`` (``sl``), ``self.to_language`` (``tl``),
        ``self.tk`` and ``self.text`` (``q``, percent-encoded); ``self.tk``
        and ``self.text`` are set by translate().
        '''
        params = [
            'client=webapp',
            'sl=%s' % self.this_language,
            'tl=%s' % self.to_language,
            'hl=zh-CN',
            'dt=at', 'dt=bd', 'dt=ex', 'dt=ld', 'dt=md',
            'dt=qca', 'dt=rw', 'dt=rm', 'dt=ss', 'dt=t',
            'ie=UTF-8',
            'oe=UTF-8',
            'clearbtn=1',
            'otf=1',
            'pc=1',
            'srcrom=0',
            'ssel=0',
            'tsel=0',
            'kc=2',
            'tk=' + str(self.tk),
            'q=' + parse.quote(self.text),
        ]
        return 'http://translate.google.cn/translate_a/single?' + '&'.join(params)

    def read_go(self, args):
        '''Download the translate_tts response for ``self.text`` to a file.

        args[0] (upload): path of the file to write
        args[1] (return_language): value for the ``tl`` query parameter

        Requires ``self.text`` and ``self.tk`` to be set (translate() sets both).
        '''
        upload, return_language = args[0], args[1]
        # The text is percent-encoded so spaces, '&', '#' and non-ASCII
        # characters cannot corrupt the query string.
        read_translate_url = 'http://translate.google.cn/translate_tts?ie=UTF-8&q=%s&tl=%s&total=1&idx=0&textlen=3&tk=%s&client=webapp&prev=input' % (
            parse.quote(self.text), return_language, self.tk)
        data = self.open_url(read_translate_url)
        with open(upload, 'wb') as f:
            f.write(data.content)

    def translate(self, text):
        '''Translate ``text`` and return the translated string.

        The non-empty first elements of the entries in the first element of
        the JSON response are joined with single spaces. Returns None when
        there are none.

        Raises ValueError when ``text`` is longer than MAX_TEXT_LENGTH.
        '''
        self.text = text
        # Validate before doing any work; a bare string cannot be raised in
        # Python 3, so a real exception type is used.
        if len(text) > self.MAX_TEXT_LENGTH:
            raise ValueError("The length of the translation exceeds the limit!!!")
        if getattr(self, '_js', None) is None:
            self._js = Py4Js()
        self.tk = self._js.getTk(text)
        response = self.open_url(self.buildUrl())
        payload = json.loads(response.content.decode('utf-8'))
        pieces = [item[0] for item in (payload[0] or []) if item[0]]
        return " ".join(pieces) if pieces else None
class UnsortedAttributes(HTMLFormatter):
    """Formatter whose ``attributes`` hook yields a tag's attributes in stored order."""

    def attributes(self, tag):
        """Yield each ``(name, value)`` pair from ``tag.attrs`` as-is."""
        yield from tag.attrs.items()
def scoate_spatii_inceput_fisier(directory):
    """Strip leading whitespace from every line of each ``.txt`` file in ``directory``.

    Files are rewritten in place as UTF-8. The number of lines and their
    line breaks are preserved; only the whitespace at the start of each
    line is removed. Files without the ``.txt`` suffix are left untouched.
    """
    # List the directory that was passed in, not a fixed path.
    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(directory, filename)
        with open(path, encoding='utf-8') as f:
            lines = f.readlines()
        cleaned = []
        for line in lines:
            # Set the line break aside before stripping: a bare lstrip()
            # would swallow it on whitespace-only lines, and re-joining
            # with '\n' would double it on every other line.
            has_newline = line.endswith('\n')
            content = line[:-1] if has_newline else line
            cleaned.append(content.lstrip() + ('\n' if has_newline else ''))
        with open(path, 'w', encoding='utf-8') as g:
            g.writelines(cleaned)
def grupeaza_propozitii(propozitii, limita):
    """Pack sentences into space-joined chunks shorter than ``limita`` characters.

    Sentences keep their order and none is dropped. Empty sentences are
    skipped. A single sentence that is itself ``limita`` characters or
    longer is cut into consecutive slices of ``limita - 1`` characters.

    Raises ValueError when ``limita`` is smaller than 2.
    """
    if limita < 2:
        raise ValueError("limita must be at least 2")
    bucati = []
    curent = ''
    for propozitie in propozitii:
        if not propozitie:
            continue
        candidat = curent + ' ' + propozitie if curent else propozitie
        if len(candidat) < limita:
            curent = candidat
            continue
        # The sentence does not fit: close the current chunk and start the
        # next one WITH this sentence, so it is not lost.
        if curent:
            bucati.append(curent)
        while len(propozitie) >= limita:
            bucati.append(propozitie[:limita - 1])
            propozitie = propozitie[limita - 1:]
        curent = propozitie
    if curent:
        bucati.append(curent)
    return bucati


if __name__ == '__main__':
    source = 'ro'  # language of the text files
    target = 'en'  # language to translate into
    directory = "c:\\Folder3"  # folder holding the .txt files to translate
    output_directory = os.path.join(directory, "translated")
    limita_caractere = 4891  # largest chunk sent in one translate() call

    fisiere = os.listdir(directory)
    os.makedirs(output_directory, exist_ok=True)
    ts = Translate_as_google(target, source)
    count = 0
    for filename in fisiere:
        if not filename.endswith(".txt"):
            continue
        count += 1
        print("Current file: ", filename)
        with open(os.path.join(directory, filename), encoding='utf-8') as f:
            file_text = f.read()

        # Split into sentences.
        # NOTE(review): capitalize() also lower-cases the rest of each
        # sentence (proper nouns included) - confirm this is intended.
        propozitii = [prop.strip().capitalize() for prop in tokenize.sent_tokenize(file_text)]
        # Remove whitespace in front of the final character of each sentence.
        propozitii = [prop[:-1].strip() + prop[-1] for prop in propozitii if prop]

        traduceri = [
            ts.translate(bucata)
            for bucata in grupeaza_propozitii(propozitii, limita_caractere)
        ]
        # translate() returns None when nothing came back; skip those and
        # keep a space between consecutive chunks.
        text_tradus = ' '.join(traducere for traducere in traduceri if traducere)

        # splitext keeps names that contain more than one dot intact.
        nume, extensie = os.path.splitext(filename)
        cale_iesire = os.path.join(output_directory, '{}_{}{}'.format(nume, target, extensie))
        with open(cale_iesire, 'w', encoding='utf-8') as f:
            f.write(text_tradus)
    print("Fisiere modificate: ", count)
    # Strip leading spaces from the translated files.
    scoate_spatii_inceput_fisier(output_directory)
Add Comment
Please, Sign In to add comment