nicuf

translate text files with google

Feb 28th, 2022 (edited)
292
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.95 KB | None | 0 0
  1. -----------
  2. EXPLANATION:
  3.  
  4. ROMANIAN: https://neculaifantanaru.com/example-python-google-translate-any-text-html-file-version-2.html
  5. ENGLISH: https://neculaifantanaru.com/en/example-python-google-translate-any-text-html-file-version-2.html
  6. -----------
  7.  
  8. # Output goes to a "translated" sub-folder inside the input directory (create it before running if it is missing).
  9. # -*- encoding: utf-8 -*-
  10. '''
  11. @File    :  google_trans.py
  12. @Time    :  2020/5/15 9:29
  13. @Author  :  hxluo
  14. @Version :  1.0
  15. @Contact :  465801795@qq.com
  16. @Desc    :  google translate
  17.  
  18. '''
  19. # import lib
  20. from bs4 import BeautifulSoup
  21. from urllib import parse
  22. import re
  23. import os
  24. from bs4.formatter import HTMLFormatter
  25. import requests
  26. import execjs
  27. import json
  28. import random
  29. import unidecode
  30. import nltk
  31. from nltk import tokenize
  32. # nltk.download('punkt')
  33.  
class Py4Js():
    """Wrap a compiled JavaScript snippet that computes the ``tk`` request token.

    The JavaScript source is compiled through ``execjs`` and exposes two
    functions: ``TL(a)``, the entry point, and ``RL(a, b)``, its helper.
    """

    def __init__(self):
        # Compile the JavaScript once per instance and keep the context so
        # getTk() can call into it. TL(a) returns a string made of two numbers
        # separated by "." (see its final `return` statement).
        self.ctx = execjs.compile("""
       function TL(a) {
       var k = "";
       var b = 406644;
       var b1 = 3293161072;
 
       var jd = ".";
       var $b = "+-a^+6";
       var Zb = "+-3^+b+-f";
 
       for (var e = [], f = 0, g = 0; g < a.length; g++) {
           var m = a.charCodeAt(g);
           128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
           e[f++] = m >> 18 | 240,
           e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
           e[f++] = m >> 6 & 63 | 128),
           e[f++] = m & 63 | 128)
       }
       a = b;
       for (f = 0; f < e.length; f++) a += e[f],
       a = RL(a, $b);
       a = RL(a, Zb);
       a ^= b1 || 0;
       0 > a && (a = (a & 2147483647) + 2147483648);
       a %= 1E6;
       return a.toString() + jd + (a ^ b)
   };
 
   function RL(a, b) {
       var t = "a";
       var Yb = "+";
       for (var c = 0; c < b.length - 2; c += 3) {
           var d = b.charAt(c + 2),
           d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
           d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
           a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
       }
       return a
   }
   """)

    def getTk(self, text):
        """Return the value of the compiled JavaScript ``TL`` function for *text*."""
        return self.ctx.call("TL", text)
  80.  
  81. class Translate_as_google(object):
  82.     def __init__(self, to_language, this_language='auto', read=False):
  83.         '''
  84.            to_language:The language to be translated into
  85.            this_language:The text to be converted, the default is auto
  86.            read:Generate a text reading file at the specified location
  87.        '''
  88.         self.this_language = this_language
  89.         self.to_language = to_language
  90.         self.read = read
  91.  
  92.     def open_url(self, url):
  93.         '''请求'''
  94.         headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
  95.         req = requests.get(url=url, headers=headers , timeout=8)
  96.  
  97.         return req
  98.  
  99.     def buildUrl(self):
  100.         '''封装请求url
  101.            sl:The text to be converted tl:The result type of the conversion qThe text to be entered'''
  102.         baseUrl = 'http://translate.google.cn/translate_a/single'
  103.         baseUrl += '?client=webapp&'
  104.         baseUrl += 'sl=%s&' % self.this_language
  105.         baseUrl += 'tl=%s&' % self.to_language
  106.         baseUrl += 'hl=zh-CN&'
  107.         baseUrl += 'dt=at&'
  108.         baseUrl += 'dt=bd&'
  109.         baseUrl += 'dt=ex&'
  110.         baseUrl += 'dt=ld&'
  111.         baseUrl += 'dt=md&'
  112.         baseUrl += 'dt=qca&'
  113.         baseUrl += 'dt=rw&'
  114.         baseUrl += 'dt=rm&'
  115.         baseUrl += 'dt=ss&'
  116.         baseUrl += 'dt=t&'
  117.         baseUrl += 'ie=UTF-8&'
  118.         baseUrl += 'oe=UTF-8&'
  119.         baseUrl += 'clearbtn=1&'
  120.         baseUrl += 'otf=1&'
  121.         baseUrl += 'pc=1&'
  122.         baseUrl += 'srcrom=0&'
  123.         baseUrl += 'ssel=0&'
  124.         baseUrl += 'tsel=0&'
  125.         baseUrl += 'kc=2&'
  126.         baseUrl += 'tk=' + str(self.tk) + '&'
  127.         baseUrl += 'q=' + parse.quote(self.text)
  128.         return baseUrl
  129.  
  130.     def read_go(self, args):
  131.         '''Speaking interception
  132.        upload:Download to path and file name
  133.        return_language:Language type returned
  134.        '''
  135.         upload, return_language = args[0], args[1]
  136.         read_translate_url = 'http://translate.google.cn/translate_tts?ie=UTF-8&q=%s&tl=%s&total=1&idx=0&textlen=3&tk=%s&client=webapp&prev=input' % (
  137.             self.text, return_language, self.tk)
  138.         data = self.open_url(read_translate_url) #Return all data requested
  139.         with open(upload, 'wb') as f:
  140.             f.write(data.content)
  141.  
  142.     def translate(self,text):
  143.         '''Translation interception'''
  144.         self.text = text
  145.         js = Py4Js()
  146.         self.tk = js.getTk(self.text)
  147.  
  148.         if len(self.text) > 4891:
  149.             raise ("The length of the translation exceeds the limit!!!")
  150.         url = self.buildUrl()
  151.         # print(url)
  152.         _result = self.open_url(url)
  153.         data = _result.content.decode('utf-8')
  154.  
  155.         tmp = json.loads(data)
  156.         jsonArray = tmp[0]
  157.         result = None
  158.         for jsonItem in jsonArray:
  159.             if jsonItem[0]:
  160.                 if result:
  161.                     result = result + " " + jsonItem[0]
  162.                 else:
  163.                     result = jsonItem[0]
  164.         return result
  165.  
  166. class UnsortedAttributes(HTMLFormatter):
  167.     def attributes(self, tag):
  168.         for k, v in tag.attrs.items():
  169.             yield k, v
  170.  
  171. def scoate_spatii_inceput_fisier(directory):
  172.     for filename in os.listdir('c:\\Folder3\\translated'):
  173.         if filename.endswith(".txt"):
  174.             with open(os.path.join(directory, filename), encoding='utf-8') as f:
  175.                 lines = f.readlines()
  176.                 lines_without_spaces = list()
  177.                 for line in lines:
  178.                     lines_without_spaces.append(line.lstrip())
  179.                 lines_without_spaces = '\n'.join(lines_without_spaces)
  180.                 with open(os.path.join(directory, filename), 'w', encoding='utf-8') as g:
  181.                     g.write(lines_without_spaces)
  182.  
  183. if __name__ == '__main__':
  184.  
  185.     source = 'ro' # put the language from the text file
  186.     target = 'en' # put the language in which you want to translate
  187.     directory = "c:\\Folder3"  #  SCHIMBA SI JOS la ultima linie directorul
  188.     count = 0
  189.     for filename in os.listdir(directory):
  190.         if filename.endswith(".txt"): #or filename.endswith(".png"):
  191.             count += 1
  192.             print("Current file: ", filename)
  193.             with open(os.path.join(directory, filename), encoding='utf-8') as f:
  194.                 file_text = f.read()
  195.  
  196.                 # impartire in propozitii
  197.                 propozitii = tokenize.sent_tokenize(file_text)
  198.                 propozitii = [prop.strip().capitalize() for prop in propozitii]
  199.                 propozitii = [prop[:-1].strip() + prop[-1] for prop in propozitii]
  200.  
  201.                 limita_caractere = 4891
  202.                 text_tradus = ''
  203.                 bucata_text = ''
  204.                 ts = Translate_as_google(target, source)
  205.  
  206.                 for propozitie in propozitii:
  207.                     if len(bucata_text) + len(propozitie) < limita_caractere:
  208.                         if bucata_text == '':
  209.                             bucata_text = bucata_text + propozitie
  210.                         else:
  211.                             bucata_text = bucata_text + ' ' + propozitie
  212.                     else:
  213.                         text_tradus += ts.translate(bucata_text)
  214.                         bucata_text = ''
  215.                 if (len(bucata_text) < limita_caractere):
  216.                     text_tradus += ts.translate(bucata_text)
  217.  
  218.                 with open(directory + "\\translated" + "\\" + filename.split('.')[0] + '_{}'.format(target) + '.' + filename.split('.')[1], 'w', encoding='utf-8') as f:
  219.                     f.write(text_tradus)
  220.         else:
  221.             continue
  222.  
  223.     print("Fisiere modificate: ", count)
  224.  
  225.     # scoate spatii fisiere
  226.     scoate_spatii_inceput_fisier("c:\\Folder3\\translated")
  227.  
Add Comment
Please, Sign In to add comment