Advertisement
sonpython

googletrans.py

May 25th, 2015
117
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 19.57 KB | None | 0 0
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. '''Michael Phan: Using Google Translate for free :)
  5. '''
  6. from __future__ import print_function
  7. from __future__ import unicode_literals
  8.  
  9. import sys
  10. import os
  11. import json
  12. import itertools
  13. import functools
  14. import time
  15. import socket
  16. import random
  17. import re
  18.  
  19. try:
  20.     # python 3
  21.     from urllib.request import build_opener, Request, HTTPHandler, HTTPSHandler
  22.     from urllib.parse import quote_plus, urlencode, unquote_plus, urljoin
  23.     izip = zip
  24.  
  25. except ImportError:
  26.     # python 2
  27.     from urllib2 import build_opener, Request, HTTPHandler, HTTPSHandler
  28.     from urllib import urlencode, unquote_plus, quote_plus
  29.     from urlparse import urljoin
  30.     from itertools import izip
  31.  
# Module-wide thread pool used as the default ``executor`` argument of
# ``Googletrans.__init__``. It is created once, at import time.
try:
    import concurrent.futures
    _g_executor = concurrent.futures.ThreadPoolExecutor(max_workers=120)
except ImportError:
    # ``concurrent.futures`` could not be imported: leave the default executor
    # unset (None).
    _g_executor = None
  37.  
# Compatibility alias: when the ``unicode`` builtin does not exist (NameError),
# bind the name to ``str`` so the rest of the module can use ``unicode``.
try:
    unicode
except NameError:
    unicode = str
  42.    
  43. def _is_sequence(arg):
  44.     return (not isinstance(arg, unicode)) and (
  45.         not isinstance(arg, bytes)) and (
  46.         hasattr(arg, "__getitem__") or hasattr(arg, "__iter__"))
  47.    
  48. def _is_bytes(arg):
  49.     return isinstance(arg, bytes)
  50.  
  51.  
  52. def _unwrapper_single_element(elements):
  53.     if len(elements) == 1:
  54.         return elements[0]
  55.     return elements
  56.        
  57.    
  58. class Error(Exception):
  59.     '''Error type
  60.    '''
  61.     pass
  62.  
  63.  
  64. _empty_comma = re.compile(r',(?=,)')
  65.  
  66. WRITING_NATIVE = ('trans',)
  67. '''native target language writing system'''
  68.  
  69. WRITING_ROMAN = ('translit',)
  70. '''romanlized writing system. only valid for some langauges, otherwise it outputs empty string'''
  71.  
  72. WRITING_NATIVE_AND_ROMAN = WRITING_NATIVE + WRITING_ROMAN
  73. '''both native and roman writing. The output will be a tuple'''
  74.  
  75. class Googletrans(object):
  76.     '''All googletrans API lives in this class
  77.  
  78.    You have to first create an instance of Googletrans to use this API
  79.  
  80.    :param writing: The translation writing system. Currently 3 values are valid
  81.    
  82.                 - :const:`WRITING_NATIVE` for native writing system
  83.                 - :const:`WRITING_ROMAN` for roman writing system
  84.                 - :const:`WRITING_NATIVE_AND_ROMAN` for both native and roman writing system. output will be a tuple in this case
  85.    
  86.    :param opener: The url opener to be used for HTTP/HTTPS query.
  87.                   If not provide, a default opener will be used.
  88.                   For proxy support you should provide an ``opener`` with ``ProxyHandler``
  89.    :type opener: `urllib2.OpenerDirector <http://docs.python.org/2/library/urllib2.html#urllib2.OpenerDirector>`_
  90.        
  91.    :param retry_times: how many times to retry when connection reset error occured. Default to 4
  92.    :type retry_times: int
  93.        
  94.    :type max_workers: int
  95.  
  96.    :param timeout: HTTP request timeout in seconds
  97.    :type timeout: int/float
  98.    
  99.    :param debug: Turn on/off the debug output
  100.    :type debug: bool
  101.  
  102.    :param service_urls: google translate url list. URLs will be used randomly for better concurrent performance. For example ``['http://translate.google.com', 'http://translate.google.de']``
  103.    :type service_urls: single string or a sequence of strings
  104.    
  105.    :param executor: the multi thread executor for handling batch input, default to a global ``futures.ThreadPoolExecutor`` instance with 120 max thead workers if ``futures`` is avalible. Set to None to disable multi thread support
  106.    :type executor: ``futures.ThreadPoolExecutor``
  107.    
  108.    .. note:: multi thread worker relys on `futures <https://pypi.python.org/pypi/futures>`_, if it is not avalible, ``googletrans`` will work under single thread mode
  109.    
  110.    :Example:
  111.  
  112.        >>> import googletrans
  113.        >>>
  114.        >>> # Create a Googletrans instance first
  115.        >>> gs = googletrans.Googletrans()
  116.        >>>
  117.        >>> # You could get all supported language list through get_languages
  118.        >>> languages = gs.get_languages()
  119.        >>> print(languages['en'])
  120.        English
  121.        >>>
  122.        >>> # Tranlate English into German
  123.        >>> print(gs.translate('hello', 'de'))
  124.        hallo
  125.        >>> # Detect the language of the text
  126.        >>> print(gs.detect('some English words'))
  127.        en
  128.        >>> # Get googletrans object dedicated for romanlized translation (romanlization)
  129.        >>> gs_roman = googletrans.Googletrans(WRITING_ROMAN)
  130.        >>> print(gs_roman.translate('hello', 'zh'))
  131.        Nín hǎo
  132.    '''
  133.  
  134.    
  135.     _MAX_LENGTH_PER_QUERY = 1800
  136.  
  137.     def __init__(self, writing=WRITING_NATIVE, opener=None, retry_times=4, executor=_g_executor,
  138.                  timeout=4, service_urls=('http://translate.google.com',), debug=False):
  139.         self._DEBUG = debug
  140.         self._MIN_TASKS_FOR_CONCURRENT = 2
  141.         self._opener = opener
  142.         self._languages = None
  143.         self._TIMEOUT = timeout
  144.         if not self._opener:
  145.             debuglevel = self._DEBUG and 1 or 0
  146.             self._opener = build_opener(
  147.                 HTTPHandler(debuglevel=debuglevel),
  148.                 HTTPSHandler(debuglevel=debuglevel))
  149.        
  150.         self._RETRY_TIMES = retry_times
  151.         self._executor = executor
  152.         self._writing = writing
  153.         if _is_sequence(service_urls):
  154.             self._service_urls = service_urls
  155.         else:
  156.             self._service_urls = (service_urls,)
  157.  
  158.     def _open_url(self, url):
  159.         if len(url) > self._MAX_LENGTH_PER_QUERY+100:
  160.             raise Error('input too large')
  161.  
  162.         # Google forbits urllib2 User-Agent: Python-urllib/2.7
  163.         request = Request(url, headers={'User-Agent':'Mozilla/4.0'})
  164.  
  165.         exception = None
  166.         # retry when get (<class 'socket.error'>, error(54, 'Connection reset by peer')
  167.         for i in range(self._RETRY_TIMES):
  168.             try:
  169.                 response = self._opener.open(request, timeout=self._TIMEOUT)
  170.                 response_content = response.read().decode('utf-8')
  171.                 if self._DEBUG:
  172.                     print('GET Response body:{}'.format(response_content))
  173.                 return response_content
  174.             except socket.error as e:
  175.                 if self._DEBUG:
  176.                     import threading
  177.                     print(threading.currentThread(), e)
  178.                 if 'Connection reset by peer' not in str(e):
  179.                     raise e
  180.                 exception = e
  181.                 time.sleep(0.0001)
  182.         raise exception
  183.    
  184.  
  185.     def _execute(self, tasks):
  186.         first_tasks = [next(tasks, None) for i in range(self._MIN_TASKS_FOR_CONCURRENT)]
  187.         tasks = (task for task in itertools.chain(first_tasks, tasks) if task)
  188.  
  189.         if not first_tasks[-1] or not self._executor:
  190.             for each in tasks:
  191.                 yield each()
  192.         else:
  193.             exception = None
  194.             for each in [self._executor.submit(t) for t in tasks]:
  195.                 if exception:
  196.                     each.cancel()
  197.                 else:
  198.                     exception = each.exception()
  199.                     if not exception:
  200.                         yield each.result()
  201.  
  202.             if exception:
  203.                 raise exception
  204.  
  205.  
  206.     def _basic_translate(self, text, target_language, source_language):
  207.         # assert _is_bytes(text)
  208.        
  209.         if not target_language:
  210.             raise Error('invalid target language')
  211.  
  212.         if not text.strip():
  213.             return tuple(u'' for i in range(len(self._writing))) , unicode(target_language)
  214.  
  215.         # Browser request for 'hello world' is:
  216.         # http://translate.google.com/translate_a/t?client=t&hl=en&sl=en&tl=zh-CN&ie=UTF-8&oe=UTF-8&multires=1&prev=conf&psl=en&ptl=en&otf=1&it=sel.2016&ssel=0&tsel=0&prev=enter&oc=3&ssel=0&tsel=0&sc=1&text=hello%20world
  217.        
  218.         # 2015-04: google had changed service, it is now:
  219.         # https://translate.google.com/translate_a/single?client=z&sl=en&tl=zh-CN&ie=UTF-8&oe=UTF-8&dt=t&dt=rm&q=hello%20world
  220.         # dt=t: translate
  221.         # dt=rm: romanlized writing, like Chinese Pinyin
  222.  
  223.         # TODO: we could randomly choose one of the google domain URLs for concurrent support
  224.         GOOGLE_TRASLATE_URL = urljoin(random.choice(self._service_urls), '/translate_a/single')
  225.         GOOGLE_TRASLATE_PARAMETERS = {
  226.             'client': 'a',
  227.             'sl': source_language,
  228.             'tl': target_language,
  229.             'ie': 'UTF-8',
  230.             'oe': 'UTF-8',
  231.             'dt': 't',
  232.             'q': text,
  233.             }
  234.  
  235.         url = '?'.join((GOOGLE_TRASLATE_URL, urlencode(GOOGLE_TRASLATE_PARAMETERS)))
  236.         if 'translit' in self._writing:
  237.             url += '&dt=rm'
  238.        
  239.         response_content = self._open_url(url)
  240.         raw_data = json.loads(_empty_comma.subn('', response_content)[0].replace(u'\xA0', u' ').replace('[,', '[1,'))
  241.         data = {'src': raw_data[-1][0][0]}
  242.        
  243.         if raw_data[0][-1][0] == 1: # roman writing
  244.             data['translit'] = raw_data[0][-1][1]
  245.             data['trans'] = u''.join(i[0] for i in raw_data[0][:-1])
  246.         else:
  247.             data['translit'] = u''
  248.             data['trans'] = u''.join(i[0] for i in raw_data[0])
  249.            
  250.         translation = tuple(data[part] for part in self._writing)
  251.        
  252.         detected_source_language = data['src']
  253.         return translation, detected_source_language
  254.  
  255.  
  256.     def get_languages(self):
  257.         '''Discover supported languages
  258.  
  259.        It returns iso639-1 language codes for
  260.        `supported languages <https://developers.google.com/translate/v2/using_rest#language-params>`_
  261.        for translation. Some language codes also include a country code, like zh-CN or zh-TW.
  262.  
  263.        .. note:: It only queries Google once for the first time and use cached result afterwards
  264.  
  265.        :returns: a dict of all supported language code and language name mapping ``{'language-code', 'Language name'}``
  266.  
  267.        :Example:
  268.  
  269.        >>> languages = Googletrans().get_languages()
  270.        >>> assert 'zh' in languages
  271.        >>> print(languages['zh'])
  272.        Chinese
  273.  
  274.        '''
  275.         if self._languages:
  276.             return self._languages
  277.  
  278.         GOOGLE_TRASLATOR_URL = 'http://translate.google.com/translate_a/l'
  279.         GOOGLE_TRASLATOR_PARAMETERS = {
  280.             'client': 't',
  281.             }
  282.  
  283.         url = '?'.join((GOOGLE_TRASLATOR_URL, urlencode(GOOGLE_TRASLATOR_PARAMETERS)))
  284.         response_content = self._open_url(url)
  285.         data = json.loads(response_content)
  286.  
  287.         languages = data['sl']
  288.         languages.update(data['tl'])
  289.         if 'auto' in languages:
  290.             del languages['auto']
  291.         if 'zh' not in languages:
  292.             languages['zh'] = 'Chinese'
  293.         self._languages = languages
  294.         return self._languages
  295.  
  296.  
  297.     _SEPERATORS = [quote_plus(i.encode('utf-8')) for i in
  298.                    u'.!?,;。,?!::"“”’‘#$%&()()*×+/<=>@#¥[\]…[]^`{|}{}~~\n\r\t ']
  299.  
  300.     def _translate_single_text(self, text, target_language, source_lauguage):
  301.         assert _is_bytes(text)
  302.         def split_text(text):
  303.             start = 0
  304.             text = quote_plus(text)
  305.             length = len(text)
  306.             while (length - start) > self._MAX_LENGTH_PER_QUERY:
  307.                 for seperator in self._SEPERATORS:
  308.                     index = text.rfind(seperator, start, start+self._MAX_LENGTH_PER_QUERY)
  309.                     if index != -1:
  310.                         break
  311.                 else:
  312.                     raise Error('input too large')
  313.                 end = index + len(seperator)
  314.                 yield unquote_plus(text[start:end])
  315.                 start = end
  316.  
  317.             yield unquote_plus(text[start:])
  318.  
  319.         def make_task(text):
  320.             return lambda: self._basic_translate(text, target_language, source_lauguage)[0]
  321.  
  322.         results = list(self._execute(make_task(i) for i in split_text(text)))
  323.         return tuple(''.join(i[n] for i in results) for n in range(len(self._writing)))
  324.  
  325.  
  326.     def translate(self, text, target_language, source_language='auto'):
  327.         '''Translate text from source language to target language
  328.  
  329.        .. note::
  330.        
  331.         - Input all source strings at once. Googletrans will batch and fetch concurrently for maximize speed.
  332.         - `futures <https://pypi.python.org/pypi/futures>`_ is required for best performance.
  333.         - It returns generator on batch input in order to better fit pipeline architecture
  334.  
  335.        :param text: The source text(s) to be translated. Batch translation is supported via sequence input
  336.        :type text: UTF-8 str; unicode; string sequence (list, tuple, iterator, generator)
  337.  
  338.        :param target_language: The language to translate the source text into.
  339.         The value should be one of the language codes listed in :func:`get_languages`
  340.        :type target_language: str; unicode
  341.  
  342.        :param source_language: The language of the source text.
  343.                                The value should be one of the language codes listed in :func:`get_languages`.
  344.                                If a language is not specified,
  345.                                the system will attempt to identify the source language automatically.
  346.        :type source_language: str; unicode
  347.        
  348.        :returns: the translated text(s)
  349.        
  350.          - unicode: on single string input
  351.          - generator of unicode: on batch input of string sequence
  352.          - tuple: if WRITING_NATIVE_AND_ROMAN is specified, it will return tuple/generator for tuple (u"native", u"roman format")
  353.  
  354.        :raises:
  355.         - :class:`Error` ('invalid target language') if target language is not set
  356.         - :class:`Error` ('input too large') if input a single large word without any punctuation or space in between
  357.  
  358.  
  359.        :Example:
  360.        
  361.         >>> gs = Googletrans()
  362.         >>> print(gs.translate('Hello World', 'de'))
  363.         Hallo Welt
  364.         >>>
  365.         >>> for i in gs.translate(['good', u'morning'], 'de'):
  366.         ...     print(i)
  367.         ...
  368.         gut aus
  369.         Morgen
  370.  
  371.        To output romanlized translation
  372.  
  373.        :Example:
  374.        
  375.         >>> gs_roman = Googletrans(WRITING_ROMAN)
  376.         >>> print(gs_roman.translate('Hello', 'zh'))
  377.         Nín hǎo
  378.        
  379.        '''
  380.  
  381.  
  382.         if not target_language:
  383.             raise Error('invalid target language')
  384.  
  385.         if not source_language:
  386.             source_language = 'auto'
  387.        
  388.         if target_language.lower() == 'zh':
  389.             target_language = 'zh-CN'
  390.            
  391.         if source_language.lower() == 'zh':
  392.             source_language = 'zh-CN'
  393.            
  394.         if not _is_sequence(text):
  395.             if isinstance(text, unicode):
  396.                 text = text.encode('utf-8')
  397.             return _unwrapper_single_element(self._translate_single_text(text, target_language, source_language))
  398.  
  399.         JOINT = u'\u26ff'
  400.         UTF8_JOINT = (u'\n%s\n' % JOINT).encode('utf-8')
  401.  
  402.         def join_texts(texts):
  403.             def convert_to_utf8(texts):
  404.                 for i in texts:
  405.                     if isinstance(i, unicode):
  406.                         i = i.encode('utf-8')
  407.                     yield i.strip()
  408.                
  409.             texts = convert_to_utf8(texts)
  410.             text = next(texts)
  411.             for i in texts:
  412.                 new_text = UTF8_JOINT.join((text, i))
  413.                 if len(quote_plus(new_text)) < self._MAX_LENGTH_PER_QUERY:
  414.                     text = new_text
  415.                 else:
  416.                     yield text
  417.                     text = i
  418.             yield text
  419.  
  420.  
  421.         def make_task(text):
  422.             def task():
  423.                 r = self._translate_single_text(text, target_language, source_language)
  424.                 r = tuple([i.strip('\n') for i in n.split(JOINT)] for n in r)
  425.                 return izip(*r)
  426.                 # return r[0]
  427.             return task
  428.                
  429.         return (_unwrapper_single_element(i) for i in
  430.                 itertools.chain.from_iterable(self._execute(make_task(i) for i in join_texts(text))))
  431.  
  432.  
  433.     def _detect_language(self, text):
  434.         if _is_bytes(text):
  435.             text = text.decode('utf-8')
  436.         return self._basic_translate(text[:50].encode('utf-8'), 'en', 'auto')[1]
  437.  
  438.  
  439.     def detect(self, text):
  440.         '''Detect language of the input text
  441.  
  442.        .. note::
  443.        
  444.         - Input all source strings at once. Googletrans will detect concurrently for maximize speed.
  445.         - `futures <https://pypi.python.org/pypi/futures>`_ is required for best performance.
  446.         - It returns generator on batch input in order to better fit pipeline architecture.
  447.  
  448.        :param text: The source text(s) whose language you want to identify.
  449.                     Batch detection is supported via sequence input
  450.        :type text: UTF-8 str; unicode; sequence of string
  451.        :returns: the language code(s)
  452.        
  453.          - unicode: on single string input
  454.          - generator of unicode: on batch input of string sequence
  455.  
  456.        :raises: :class:`Error` if parameter type or value is not valid
  457.  
  458.        Example::
  459.        
  460.         >>> gs = Googletrans()
  461.         >>> print(gs.detect('hello world'))
  462.         en
  463.         >>> for i in gs.detect([u'hello', 'Hallo']):
  464.         ...     print(i)
  465.         ...
  466.         en
  467.         de
  468.  
  469.        '''
  470.         if _is_sequence(text):
  471.             return self._execute(functools.partial(self._detect_language, i) for i in text)
  472.         return self._detect_language(text)
  473.  
  474.  
  475. def _main(argv):
  476.     import optparse
  477.  
  478.     usage = "usage: %prog [options] <file1 file2 ...>\n<stdin> will be used as input source if no file specified."
  479.    
  480.     parser = optparse.OptionParser(usage=usage, version="%%prog %s @ Copyright %s" % (__version__, __copyright__))
  481.     parser.add_option('-t', '--target-language', metavar='zh-CN',
  482.                       help='specify target language to translate the source text into')
  483.     parser.add_option('-s', '--source-language', default='auto', metavar='en',
  484.                       help='specify source language, if not provide it will identify the source language automatically')
  485.     parser.add_option('-i', '--input-encoding', default=sys.getfilesystemencoding(), metavar='utf-8',
  486.                       help='specify input encoding, default to current console system encoding')
  487.     parser.add_option('-o', '--output-encoding', default=sys.getfilesystemencoding(), metavar='utf-8',
  488.                       help='specify output encoding, default to current console system encoding')
  489.     parser.add_option('-r', '--roman', action="store_true",
  490.                       help='change translation writing to roman (e.g.: output pinyin instead of Chinese charactors for Chinese. It only valid for some of the target languages)')
  491.  
  492.    
  493.     options, args = parser.parse_args(argv[1:])
  494.    
  495.     if not options.target_language:
  496.         print('Error: missing target language!')
  497.         parser.print_help()
  498.         return
  499.    
  500.     writing = WRITING_NATIVE
  501.     if options.roman:
  502.         writing = WRITING_ROMAN
  503.    
  504.     gs = Googletrans(writing=writing)
  505.     import fileinput
  506.     # inputs = fileinput.input(args, mode='rU', openhook=fileinput.hook_encoded(options.input_encoding))
  507.     inputs = fileinput.input(args, mode='rb')
  508.     inputs = (i.decode(options.input_encoding) for i in inputs)
  509.     outputs = gs.translate(inputs, options.target_language, options.source_language)
  510.     for i in outputs:
  511.         sys.stdout.write((i+u'\n').encode(options.output_encoding))
  512.         sys.stdout.flush()
  513.    
  514.    
  515. if __name__ == '__main__':
  516.     try:
  517.         _main(sys.argv)
  518.     except:
  519.         error = sys.exc_info()[1]
  520.         if len(str(error)) > 2:
  521.             print(error)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement