h8rt3rmin8r

googler.py

Oct 17th, 2018
179
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python3
  2. #
  3. # Copyright © 2008 Henri Hakkinen
  4. # Copyright © 2015-2018 Arun Prakash Jana <engineerarun@gmail.com>
  5. #
  6. # This program is free software: you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation, either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # This program is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18.  
  19. import argparse
  20. import atexit
  21. import base64
  22. import collections
  23. import codecs
  24. import functools
  25. import gzip
  26. import html.entities
  27. import html.parser
  28. import http.client
  29. from http.client import HTTPSConnection
  30. import locale
  31. import logging
  32. import os
  33. import shutil
  34. import signal
  35. import socket
  36. import ssl
  37. from subprocess import Popen, PIPE, DEVNULL
  38. import sys
  39. import textwrap
  40. import urllib.parse
  41. import webbrowser
  42.  
  43. # Python optional dependency compatibility layer
  44. try:
  45.     import readline
  46. except ImportError:
  47.     pass
  48.  
  49.  
  50. # Basic setup
  51.  
  52. try:
  53.     import setproctitle
  54.     setproctitle.setproctitle('googler')
  55. except Exception:
  56.     pass
  57.  
# Emit log records to stderr in the form "[LEVEL] message"
logging.basicConfig(format='[%(levelname)s] %(message)s')
logger = logging.getLogger()  # root logger shared across the module
  60.  
  61.  
  62. def sigint_handler(signum, frame):
  63.     print('\nInterrupted.', file=sys.stderr)
  64.     sys.exit(1)
  65.  
  66. signal.signal(signal.SIGINT, sigint_handler)
  67.  
  68.  
# Constants

_VERSION_ = '3.7.1'

# Map from single-letter color keys to ANSI SGR escape sequences.
# a-h: standard foreground colors; i-p: bright foreground colors;
# uppercase variants additionally set bold; x: reset, X: bold,
# y: reverse video, Y: bold reverse video.
COLORMAP = {k: '\x1b[%sm' % v for k, v in {
    'a': '30', 'b': '31', 'c': '32', 'd': '33',
    'e': '34', 'f': '35', 'g': '36', 'h': '37',
    'i': '90', 'j': '91', 'k': '92', 'l': '93',
    'm': '94', 'n': '95', 'o': '96', 'p': '97',
    'A': '30;1', 'B': '31;1', 'C': '32;1', 'D': '33;1',
    'E': '34;1', 'F': '35;1', 'G': '36;1', 'H': '37;1',
    'I': '90;1', 'J': '91;1', 'K': '92;1', 'L': '93;1',
    'M': '94;1', 'N': '95;1', 'O': '96;1', 'P': '97;1',
    'x': '0', 'X': '1', 'y': '7', 'Y': '7;1',
}.items()}

# Identifies this client in HTTP requests
USER_AGENT = ('googler/' + _VERSION_)
ua = True  # User Agent is enabled by default

# Browsers treated as text-based; their output is not suppressed by default
# (see open_url)
text_browsers = ['elinks', 'links', 'lynx', 'w3m', 'www-browser']

# Self-upgrade parameters
#
# Downstream packagers are recommended to turn off the entire self-upgrade
# mechanism through
#
#     make disable-self-upgrade
#
# before running `make install'.

ENABLE_SELF_UPGRADE_MECHANISM = True
API_REPO_BASE = 'https://api.github.com/repos/jarun/googler'
RAW_DOWNLOAD_REPO_BASE = 'https://raw.githubusercontent.com/jarun/googler'
  102.  
  103.  
  104. # Global helper functions
  105.  
def open_url(url):
    """Open an URL in the user's default web browser.

    The string attribute ``open_url.url_handler`` can be used to open URLs
    in a custom CLI script or utility. A subprocess is spawned with url as
    the parameter in this case instead of the usual webbrowser.open() call.

    Whether the browser's output (both stdout and stderr) are suppressed
    depends on the boolean attribute ``open_url.suppress_browser_output``.
    If the attribute is not set upon a call, set it to a default value,
    which means False if BROWSER is set to a known text-based browser --
    elinks, links, lynx, w3m or 'www-browser'; or True otherwise.

    The string attribute ``open_url.override_text_browser`` can be used to
    ignore env var BROWSER as well as some known text-based browsers and
    attempt to open url in a GUI browser available.
    Note: If a GUI browser is indeed found, this option ignores the program
          option `show-browser-logs`
    """
    logger.debug('Opening %s', url)

    # Custom URL handler gets max priority
    if hasattr(open_url, 'url_handler'):
        p = Popen([open_url.url_handler, url], stdin=PIPE)
        p.communicate()
        return

    browser = webbrowser.get()
    if open_url.override_text_browser:
        # Remember the caller's suppression setting; a GUI browser forces
        # suppression below, and the original value is restored at the end.
        browser_output = open_url.suppress_browser_output
        # NOTE(review): webbrowser._tryorder is an undocumented internal;
        # this picks the first registered browser that is not a known
        # text-based browser.
        for name in [b for b in webbrowser._tryorder if b not in text_browsers]:
            browser = webbrowser.get(name)
            logger.debug(browser)

            # Found a GUI browser, suppress browser output
            open_url.suppress_browser_output = True
            break

    if open_url.suppress_browser_output:
        # Redirect fds 1 and 2 (stdout/stderr) to /dev/null at the OS level
        # so the spawned browser cannot write to the terminal; the original
        # fds are saved for restoration in the finally block.
        _stderr = os.dup(2)
        os.close(2)
        _stdout = os.dup(1)
        os.close(1)
        fd = os.open(os.devnull, os.O_RDWR)
        os.dup2(fd, 2)
        os.dup2(fd, 1)
    try:
        browser.open(url, new=2)  # new=2: open in a new tab, if possible
    finally:
        if open_url.suppress_browser_output:
            # Restore the original stdout/stderr file descriptors
            os.close(fd)
            os.dup2(_stderr, 2)
            os.dup2(_stdout, 1)

    if open_url.override_text_browser:
        # Restore the caller's suppression preference
        open_url.suppress_browser_output = browser_output
  162.  
  163.  
  164. def printerr(msg):
  165.     """Print message, verbatim, to stderr.
  166.  
  167.    ``msg`` could be any stringifiable value.
  168.    """
  169.     print(msg, file=sys.stderr)
  170.  
  171.  
  172. def unwrap(text):
  173.     """Unwrap text."""
  174.     lines = text.split('\n')
  175.     result = ''
  176.     for i in range(len(lines) - 1):
  177.         result += lines[i]
  178.         if not lines[i]:
  179.             # Paragraph break
  180.             result += '\n\n'
  181.         elif lines[i + 1]:
  182.             # Next line is not paragraph break, add space
  183.             result += ' '
  184.     # Handle last line
  185.     result += lines[-1] if lines[-1] else '\n'
  186.     return result
  187.  
  188.  
def check_stdout_encoding():
    """Make sure stdout encoding is utf-8.

    If not, print error message and instructions, then exit with
    status 1.

    This function is a no-op on win32 because encoding on win32 is
    messy, and let's just hope for the best. /s
    """
    if sys.platform == 'win32':
        return

    # Use codecs.lookup to resolve text encoding alias
    encoding = codecs.lookup(sys.stdout.encoding).name
    if encoding != 'utf-8':
        # Gather diagnostics to help the user find the misconfiguration
        locale_lang, locale_encoding = locale.getlocale()
        if locale_lang is None:
            locale_lang = '<unknown>'
        if locale_encoding is None:
            locale_encoding = '<unknown>'
        ioencoding = os.getenv('PYTHONIOENCODING', 'not set')
        sys.stderr.write(unwrap(textwrap.dedent("""\
        stdout encoding '{encoding}' detected. googler requires utf-8 to
        work properly. The wrong encoding may be due to a non-UTF-8
        locale or an improper PYTHONIOENCODING. (For the record, your
        locale language is {locale_lang} and locale encoding is
        {locale_encoding}; your PYTHONIOENCODING is {ioencoding}.)

        Please set a UTF-8 locale (e.g., en_US.UTF-8) or set
        PYTHONIOENCODING to utf-8.
        """.format(
            encoding=encoding,
            locale_lang=locale_lang,
            locale_encoding=locale_encoding,
            ioencoding=ioencoding,
        ))))
        sys.exit(1)
  226.  
  227.  
  228. # Classes
  229.  
class TLS1_2Connection(HTTPSConnection):
    """Overrides HTTPSConnection.connect to specify TLS version

    NOTE: TLS 1.2 is supported from Python 3.4
    """

    def __init__(self, host, **kwargs):
        HTTPSConnection.__init__(self, host, **kwargs)

    def connect(self, notweak=False):
        """Connect, preferring TLS 1.2 and applying optional TCP tweaks.

        Parameters
        ----------
        notweak : bool
            When True, skip both the Linux TCP socket tweaks and the
            custom TLS context (fall back to the stock implementation).
        """
        sock = socket.create_connection((self.host, self.port),
                                        self.timeout, self.source_address)

        # Optimizations not available on OS X
        if not notweak and sys.platform.startswith('linux'):
            try:
                sock.setsockopt(socket.SOL_TCP, socket.TCP_DEFER_ACCEPT, 1)
                sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_QUICKACK, 1)
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 524288)
            except OSError:
                # Doesn't work on Windows' Linux subsystem (#179)
                logger.debug('setsockopt failed')

        if getattr(self, '_tunnel_host', None):
            # Proxy tunnel requested via set_tunnel(): keep the plain socket
            # and fall through to the stock connect below.
            # NOTE(review): verify the CONNECT/TLS handshake path against
            # http.client's implementation.
            self.sock = sock
        elif not notweak:
            # Try to use TLS 1.2
            ssl_context = None
            if hasattr(ssl, 'PROTOCOL_TLS'):
                # Since Python 3.5.3
                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS)
                # Negotiate the best protocol but forbid anything below TLS 1.2
                ssl_context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
                                        ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
            elif hasattr(ssl, 'PROTOCOL_TLSv1_2'):
                # Since Python 3.4
                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
            if ssl_context:
                self.sock = ssl_context.wrap_socket(sock)
                return

        # Fallback
        HTTPSConnection.connect(self)
  272.  
  273.  
class GoogleUrl(object):
    """
    This class constructs the Google Search/News URL.

    This class is modelled on urllib.parse.ParseResult for familiarity,
    which means it supports reading of all six attributes -- scheme,
    netloc, path, params, query, fragment -- of
    urllib.parse.ParseResult, as well as the geturl() method.

    However, the attributes (properties) and methods listed below should
    be the preferred methods of access to this class.

    Parameters
    ----------
    opts : dict or argparse.Namespace, optional
        See the ``opts`` parameter of `update`.

    Other Parameters
    ----------------
    See "Other Parameters" of `update`.

    Attributes
    ----------
    hostname : str
        Read-write property.
    keywords : str or list of strs
        Read-write property.
    news : bool
        Read-only property.
    url : str
        Read-only property.

    Methods
    -------
    full()
    relative()
    update(opts=None, **kwargs)
    set_queries(**kwargs)
    unset_queries(*args)
    next_page()
    prev_page()
    first_page()

    """

    def __init__(self, opts=None, **kwargs):
        self.scheme = 'https'
        # self.netloc is a calculated property
        self.path = '/search'
        self.params = ''
        # self.query is a calculated property
        self.fragment = ''

        self._tld = None        # TLD key into TLD_TO_DOMAIN_MAP (None -> .com)
        self._num = 10          # results per page; 10 is Google's default
        self._start = 0         # result offset of the current page
        self._keywords = []     # search keywords, str or list of strs
        self._sites = None      # optional list of sites to restrict search to
        self._query_dict = {
            'ie': 'UTF-8',
            'oe': 'UTF-8',
        }
        self.update(opts, **kwargs)

    def __str__(self):
        return self.url

    @property
    def url(self):
        """The full Google URL you want."""
        return self.full()

    @property
    def hostname(self):
        """The hostname."""
        return self.netloc

    @hostname.setter
    def hostname(self, hostname):
        # NOTE(review): `netloc` is defined below as a read-only property,
        # so this assignment raises AttributeError if the setter is ever
        # used -- confirm against the rest of the file.
        self.netloc = hostname

    @property
    def keywords(self):
        """The keywords, either a str or a list of strs."""
        return self._keywords

    @keywords.setter
    def keywords(self, keywords):
        self._keywords = keywords

    @property
    def news(self):
        """Whether the URL is for Google News."""
        return 'tbm' in self._query_dict and self._query_dict['tbm'] == 'nws'

    def full(self):
        """Return the full URL.

        Returns
        -------
        str

        """
        url = (self.scheme + ':') if self.scheme else ''
        url += '//' + self.netloc + self.relative()
        return url

    def relative(self):
        """Return the relative URL (without scheme and authority).

        Authority (see RFC 3986 section 3.2), or netloc in the
        terminology of urllib.parse, basically means the hostname
        here. The relative URL is good for making HTTP(S) requests to a
        known host.

        Returns
        -------
        str

        """
        rel = self.path
        if self.params:
            rel += ';' + self.params
        if self.query:
            rel += '?' + self.query
        if self.fragment:
            rel += '#' + self.fragment
        return rel

    def update(self, opts=None, **kwargs):
        """Update the URL with the given options.

        Parameters
        ----------
        opts : dict or argparse.Namespace, optional
            Carries options that affect the Google Search/News URL. The
            list of currently recognized option keys with expected value
            types:

                duration: str (GooglerArgumentParser.is_duration)
                exact: bool
                keywords: str or list of strs
                lang: str
                news: bool
                num: int
                sites: list of strs
                start: int
                tld: str
                unfilter: bool

        Other Parameters
        ----------------
        kwargs
            The `kwargs` dict extends `opts`, that is, options can be
            specified either way, in `opts` or as individual keyword
            arguments.

        """

        if opts is None:
            opts = {}
        if hasattr(opts, '__dict__'):
            # Accept argparse.Namespace (or anything dict-like via __dict__)
            opts = opts.__dict__
        opts.update(kwargs)

        qd = self._query_dict
        if 'duration' in opts and opts['duration']:
            # tbs=qdr:<d|w|m|y...> restricts results by recency
            qd['tbs'] = 'qdr:%s' % opts['duration']
        if 'exact' in opts:
            # nfpr=1 disables Google's automatic spelling correction
            if opts['exact']:
                qd['nfpr'] = 1
            else:
                qd.pop('nfpr', None)
        if 'keywords' in opts:
            self._keywords = opts['keywords']
        if 'lang' in opts and opts['lang']:
            qd['hl'] = opts['lang']
        if 'news' in opts:
            # tbm=nws switches to the Google News vertical
            if opts['news']:
                qd['tbm'] = 'nws'
            else:
                qd.pop('tbm', None)
        if 'num' in opts:
            self._num = opts['num']
        if 'sites' in opts:
            self._sites = opts['sites']
        if 'start' in opts:
            self._start = opts['start']
        if 'tld' in opts:
            self._tld = opts['tld']
        if 'unfilter' in opts and opts['unfilter']:
            # filter=0 asks Google not to omit "similar" results
            qd['filter'] = 0

    def set_queries(self, **kwargs):
        """Forcefully set queries outside the normal `update` mechanism.

        Other Parameters
        ----------------
        kwargs
            Arbitrary key value pairs to be set in the query string. All
            keys and values should be stringifiable.

            Note that certain keys, e.g., ``q``, have their values
            constructed on the fly, so setting those has no actual
            effect.

        """
        for k, v in kwargs.items():
            self._query_dict[k] = v

    def unset_queries(self, *args):
        """Forcefully unset queries outside the normal `update` mechanism.

        Other Parameters
        ----------------
        args
            Arbitrary keys to be unset. No exception is raised if a key
            does not exist in the first place.

            Note that certain keys, e.g., ``q``, are always included in
            the resulting URL, so unsetting those has no actual effect.

        """
        for k in args:
            self._query_dict.pop(k, None)

    def next_page(self):
        """Navigate to the next page."""
        self._start += self._num

    def prev_page(self):
        """Navigate to the previous page.

        Raises
        ------
        ValueError
            If already at the first page (``start=0`` in the current
            query string).

        """
        if self._start == 0:
            raise ValueError('Already at the first page.')
        # Clamp at the first page rather than going negative
        self._start = (self._start - self._num) if self._start > self._num else 0

    def first_page(self):
        """Navigate to the first page.

        Raises
        ------
        ValueError
            If already at the first page (``start=0`` in the current
            query string).

        """
        if self._start == 0:
            raise ValueError('Already at the first page.')
        self._start = 0

    # Data source: https://web.archive.org/web/20170615200243/https://en.wikipedia.org/wiki/List_of_Google_domains
    # Scraper script: https://gist.github.com/zmwangx/b976e83c14552fe18b71
    TLD_TO_DOMAIN_MAP = {
        'ac': 'google.ac',      'ad': 'google.ad',      'ae': 'google.ae',
        'af': 'google.com.af',  'ag': 'google.com.ag',  'ai': 'google.com.ai',
        'al': 'google.al',      'am': 'google.am',      'ao': 'google.co.ao',
        'ar': 'google.com.ar',  'as': 'google.as',      'at': 'google.at',
        'au': 'google.com.au',  'az': 'google.az',      'ba': 'google.ba',
        'bd': 'google.com.bd',  'be': 'google.be',      'bf': 'google.bf',
        'bg': 'google.bg',      'bh': 'google.com.bh',  'bi': 'google.bi',
        'bj': 'google.bj',      'bn': 'google.com.bn',  'bo': 'google.com.bo',
        'br': 'google.com.br',  'bs': 'google.bs',      'bt': 'google.bt',
        'bw': 'google.co.bw',   'by': 'google.by',      'bz': 'google.com.bz',
        'ca': 'google.ca',      'cat': 'google.cat',    'cc': 'google.cc',
        'cd': 'google.cd',      'cf': 'google.cf',      'cg': 'google.cg',
        'ch': 'google.ch',      'ci': 'google.ci',      'ck': 'google.co.ck',
        'cl': 'google.cl',      'cm': 'google.cm',      'cn': 'google.cn',
        'co': 'google.com.co',  'cr': 'google.co.cr',   'cu': 'google.com.cu',
        'cv': 'google.cv',      'cy': 'google.com.cy',  'cz': 'google.cz',
        'de': 'google.de',      'dj': 'google.dj',      'dk': 'google.dk',
        'dm': 'google.dm',      'do': 'google.com.do',  'dz': 'google.dz',
        'ec': 'google.com.ec',  'ee': 'google.ee',      'eg': 'google.com.eg',
        'es': 'google.es',      'et': 'google.com.et',  'fi': 'google.fi',
        'fj': 'google.com.fj',  'fm': 'google.fm',      'fr': 'google.fr',
        'ga': 'google.ga',      'ge': 'google.ge',      'gf': 'google.gf',
        'gg': 'google.gg',      'gh': 'google.com.gh',  'gi': 'google.com.gi',
        'gl': 'google.gl',      'gm': 'google.gm',      'gp': 'google.gp',
        'gr': 'google.gr',      'gt': 'google.com.gt',  'gy': 'google.gy',
        'hk': 'google.com.hk',  'hn': 'google.hn',      'hr': 'google.hr',
        'ht': 'google.ht',      'hu': 'google.hu',      'id': 'google.co.id',
        'ie': 'google.ie',      'il': 'google.co.il',   'im': 'google.im',
        'in': 'google.co.in',   'io': 'google.io',      'iq': 'google.iq',
        'is': 'google.is',      'it': 'google.it',      'je': 'google.je',
        'jm': 'google.com.jm',  'jo': 'google.jo',      'jp': 'google.co.jp',
        'ke': 'google.co.ke',   'kg': 'google.kg',      'kh': 'google.com.kh',
        'ki': 'google.ki',      'kr': 'google.co.kr',   'kw': 'google.com.kw',
        'kz': 'google.kz',      'la': 'google.la',      'lb': 'google.com.lb',
        'lc': 'google.com.lc',  'li': 'google.li',      'lk': 'google.lk',
        'ls': 'google.co.ls',   'lt': 'google.lt',      'lu': 'google.lu',
        'lv': 'google.lv',      'ly': 'google.com.ly',  'ma': 'google.co.ma',
        'md': 'google.md',      'me': 'google.me',      'mg': 'google.mg',
        'mk': 'google.mk',      'ml': 'google.ml',      'mm': 'google.com.mm',
        'mn': 'google.mn',      'ms': 'google.ms',      'mt': 'google.com.mt',
        'mu': 'google.mu',      'mv': 'google.mv',      'mw': 'google.mw',
        'mx': 'google.com.mx',  'my': 'google.com.my',  'mz': 'google.co.mz',
        'na': 'google.com.na',  'ne': 'google.ne',      'nf': 'google.com.nf',
        'ng': 'google.com.ng',  'ni': 'google.com.ni',  'nl': 'google.nl',
        'no': 'google.no',      'np': 'google.com.np',  'nr': 'google.nr',
        'nu': 'google.nu',      'nz': 'google.co.nz',   'om': 'google.com.om',
        'pa': 'google.com.pa',  'pe': 'google.com.pe',  'pg': 'google.com.pg',
        'ph': 'google.com.ph',  'pk': 'google.com.pk',  'pl': 'google.pl',
        'pn': 'google.co.pn',   'pr': 'google.com.pr',  'ps': 'google.ps',
        'pt': 'google.pt',      'py': 'google.com.py',  'qa': 'google.com.qa',
        'ro': 'google.ro',      'rs': 'google.rs',      'ru': 'google.ru',
        'rw': 'google.rw',      'sa': 'google.com.sa',  'sb': 'google.com.sb',
        'sc': 'google.sc',      'se': 'google.se',      'sg': 'google.com.sg',
        'sh': 'google.sh',      'si': 'google.si',      'sk': 'google.sk',
        'sl': 'google.com.sl',  'sm': 'google.sm',      'sn': 'google.sn',
        'so': 'google.so',      'sr': 'google.sr',      'st': 'google.st',
        'sv': 'google.com.sv',  'td': 'google.td',      'tg': 'google.tg',
        'th': 'google.co.th',   'tj': 'google.com.tj',  'tk': 'google.tk',
        'tl': 'google.tl',      'tm': 'google.tm',      'tn': 'google.tn',
        'to': 'google.to',      'tr': 'google.com.tr',  'tt': 'google.tt',
        'tw': 'google.com.tw',  'tz': 'google.co.tz',   'ua': 'google.com.ua',
        'ug': 'google.co.ug',   'uk': 'google.co.uk',   'uy': 'google.com.uy',
        'uz': 'google.co.uz',   'vc': 'google.com.vc',  've': 'google.co.ve',
        'vg': 'google.vg',      'vi': 'google.co.vi',   'vn': 'google.com.vn',
        'vu': 'google.vu',      'ws': 'google.ws',      'za': 'google.co.za',
        'zm': 'google.co.zm',   'zw': 'google.co.zw',
    }

    @property
    def netloc(self):
        """The hostname."""
        # Unknown or unset TLD falls back to the default www.google.com
        try:
            return 'www.' + self.TLD_TO_DOMAIN_MAP[self._tld]
        except KeyError:
            return 'www.google.com'

    @property
    def query(self):
        """The query string."""
        qd = {}
        qd.update(self._query_dict)
        if self._num != 10:  # Skip sending the default
            qd['num'] = self._num
        if self._start:  # Skip sending the default
            qd['start'] = self._start

        # Construct the q query
        q = ''
        keywords = self._keywords
        sites = self._sites
        if keywords:
            if isinstance(keywords, list):
                q += '+'.join(urllib.parse.quote_plus(kw) for kw in keywords)
            else:
                q += urllib.parse.quote_plus(keywords)
        if sites:
            q += '+OR'.join('+site:' + urllib.parse.quote_plus(site) for site in sites)
        qd['q'] = q

        # Sort keys for a deterministic query string
        return '&'.join('%s=%s' % (k, qd[k]) for k in sorted(qd.keys()))
  635.  
  636.  
class GoogleConnectionError(Exception):
    """Raised when connecting to or fetching from Google fails."""
    pass
  639.  
  640.  
  641. class GoogleConnection(object):
  642.     """
  643.    This class facilitates connecting to and fetching from Google.
  644.  
  645.    Parameters
  646.    ----------
  647.    See http.client.HTTPSConnection for documentation of the
  648.    parameters.
  649.  
  650.    Raises
  651.    ------
  652.    GoogleConnectionError
  653.  
  654.    Attributes
  655.    ----------
  656.    host : str
  657.        The currently connected host. Read-only property. Use
  658.        `new_connection` to change host.
  659.  
  660.    Methods
  661.    -------
  662.    new_connection(host=None, port=None, timeout=45)
  663.    renew_connection(timeout=45)
  664.    fetch_page(url)
  665.    close()
  666.  
  667.    """
  668.  
    def __init__(self, host, port=None, timeout=45, proxy=None, notweak=False):
        """Initialize state and establish the first connection to ``host``."""
        self._host = None   # populated by new_connection()
        self._port = None   # populated by new_connection()
        self._proxy = proxy       # optional proxy spec string; None = direct
        self._notweak = notweak   # True disables TCP/TLS tweaks
        self._conn = None         # underlying TLS1_2Connection
        self.new_connection(host, port=port, timeout=timeout)
        self.cookie = ''          # raw cookie string, initially empty
  677.  
    @property
    def host(self):
        """The host currently connected to."""
        # Read-only; use new_connection() to change the host.
        return self._host
  682.  
    def new_connection(self, host=None, port=None, timeout=45):
        """Close the current connection (if any) and establish a new one.

        Parameters
        ----------
        See http.client.HTTPSConnection for documentation of the
        parameters. Renew the connection (i.e., reuse the current host
        and port) if host is None or empty.

        Raises
        ------
        GoogleConnectionError

        """
        if self._conn:
            self._conn.close()

        # Empty host means renew: reuse the previously stored endpoint
        if not host:
            host = self._host
            port = self._port
        self._host = host
        self._port = port
        host_display = host + (':%d' % port if port else '')

        proxy = self._proxy

        if proxy:
            # Connect to the proxy, then tunnel (HTTP CONNECT) to the host
            proxy_user_passwd, proxy_host_port = parse_proxy_spec(proxy)

            logger.debug('Connecting to proxy server %s', proxy_host_port)
            self._conn = TLS1_2Connection(proxy_host_port, timeout=timeout)

            logger.debug('Tunnelling to host %s' % host_display)
            connect_headers = {}
            if proxy_user_passwd:
                # Proxy Basic auth: base64 of "user:password"
                connect_headers['Proxy-Authorization'] = 'Basic %s' % base64.b64encode(
                    proxy_user_passwd.encode('utf-8')
                ).decode('utf-8')
            self._conn.set_tunnel(host, port=port, headers=connect_headers)

            try:
                self._conn.connect(self._notweak)
            except Exception as e:
                msg = 'Failed to connect to proxy server %s: %s.' % (proxy, e)
                raise GoogleConnectionError(msg)
        else:
            logger.debug('Connecting to new host %s', host_display)
            self._conn = TLS1_2Connection(host, port=port, timeout=timeout)
            try:
                self._conn.connect(self._notweak)
            except Exception as e:
                msg = 'Failed to connect to %s: %s.' % (host_display, e)
                raise GoogleConnectionError(msg)
  736.  
    def renew_connection(self, timeout=45):
        """Renew current connection.

        Equivalent to ``new_connection(timeout=timeout)``, i.e.,
        reconnect to the same host and port.

        """
        self.new_connection(timeout=timeout)
  744.  
    def fetch_page(self, url):
        """Fetch a URL.

        Allows one reconnection and multiple redirections before failing
        and raising GoogleConnectionError.

        Parameters
        ----------
        url : str
            The URL to fetch, relative to the host.

        Raises
        ------
        GoogleConnectionError
            When not getting HTTP 200 even after the allowed one
            reconnection and/or one redirection, or when Google is
            blocking query due to unusual activity.

        Returns
        -------
        str
            Response payload, gunzipped (if applicable) and decoded (in UTF-8).

        """
        try:
            self._raw_get(url)
        except (http.client.HTTPException, OSError) as e:
            logger.debug('Got exception: %s.', e)
            # One reconnection attempt before giving up
            logger.debug('Attempting to reconnect...')
            self.renew_connection()
            try:
                self._raw_get(url)
            except http.client.HTTPException as e:
                logger.debug('Got exception: %s.', e)
                raise GoogleConnectionError("Failed to get '%s'." % url)

        resp = self._resp
        # Follow at most 3 redirections
        redirect_counter = 0
        while resp.status != 200 and redirect_counter < 3:
            if resp.status in {301, 302, 303, 307, 308}:
                redirection_url = resp.getheader('location', '')
                # A redirect to Google's "sorry" pages means the client has
                # been flagged for unusual traffic
                if 'sorry/IndexRedirect?' in redirection_url or 'sorry/index?' in redirection_url:
                    raise GoogleConnectionError('Connection blocked due to unusual activity.')
                self._redirect(redirection_url)
                resp = self._resp
                redirect_counter += 1
            else:
                break

        if resp.status != 200:
            raise GoogleConnectionError('Got HTTP %d: %s' % (resp.status, resp.reason))

        payload = resp.read()
        try:
            # gzip.decompress raises OSError on non-gzip data
            return gzip.decompress(payload).decode('utf-8')
        except OSError:
            # Not gzipped
            return payload.decode('utf-8')
  803.  
  804.     def _redirect(self, url):
  805.         """Redirect to and fetch a new URL.
  806.  
  807.        Like `_raw_get`, the response is stored in ``self._resp``. A new
  808.        connection is made if redirecting to a different host.
  809.  
  810.        Parameters
  811.        ----------
  812.        url : str
  813.            If absolute and points to a different host, make a new
  814.            connection.
  815.  
  816.        Raises
  817.        ------
  818.        GoogleConnectionError
  819.  
  820.        """
  821.         logger.debug('Redirecting to URL %s', url)
  822.         segments = urllib.parse.urlparse(url)
  823.  
  824.         host = segments.netloc
  825.         if host != self._host:
  826.             self.new_connection(host)
  827.  
  828.         relurl = urllib.parse.urlunparse(('', '') + segments[2:])
  829.         try:
  830.             self._raw_get(relurl)
  831.         except http.client.HTTPException as e:
  832.             logger.debug('Got exception: %s.', e)
  833.             raise GoogleConnectionError("Failed to get '%s'." % url)
  834.  
  835.     def _raw_get(self, url):
  836.         """Make a raw HTTP GET request.
  837.  
  838.        No status check (which implies no redirection). Response can be
  839.        accessed from ``self._resp``.
  840.  
  841.        Parameters
  842.        ----------
  843.        url : str
  844.            URL relative to the host, used in the GET request.
  845.  
  846.        Raises
  847.        ------
  848.        http.client.HTTPException
  849.  
  850.        """
  851.         logger.debug('Fetching URL %s', url)
  852.         self._conn.request('GET', url, None, {
  853.             'Accept-Encoding': 'gzip',
  854.             'User-Agent': USER_AGENT if ua else '',
  855.             'Cookie': self.cookie,
  856.             'Connection': 'keep-alive',
  857.             'DNT': '1',
  858.         })
  859.         self._resp = self._conn.getresponse()
  860.         if self.cookie == '':
  861.             complete_cookie = self._resp.getheader('Set-Cookie')
  862.             # Cookie won't be available is already blocked
  863.             if complete_cookie is not None:
  864.                 self.cookie = complete_cookie[:complete_cookie.find(';')]
  865.                 logger.debug('Cookie: %s' % self.cookie)
  866.  
  867.     def close(self):
  868.         """Close the connection (if one is active)."""
  869.         if self._conn:
  870.             self._conn.close()
  871.  
  872.  
  873. def annotate_tag(annotated_starttag_handler):
  874.     # See parser logic within the GoogleParser class for documentation.
  875.     #
  876.     # In particular, search for "Ignore List" to view detailed
  877.     # documentation of the ignore list.
  878.     #
  879.     # annotated_starttag_handler(self, tag: str, attrsdict: dict) -> annotation
  880.     # Returns: HTMLParser.handle_starttag(self, tag: str, attrs: list) -> None
  881.  
  882.     def handler(self, tag, attrs):
  883.         # Get context; assumes that the handler is called SCOPE_start
  884.         context = annotated_starttag_handler.__name__[:-6]
  885.  
  886.         # If context is 'ignore', ignore all tests
  887.         if context == 'ignore':
  888.             self.insert_annotation(tag, None)
  889.             return
  890.  
  891.         attrs = dict(attrs)
  892.  
  893.         # Compare against ignore list
  894.         ignored = False
  895.         for selector in self.IGNORE_LIST:
  896.             for attr in selector:
  897.                 if attr == 'tag':
  898.                     if tag != selector['tag']:
  899.                         break
  900.                 elif attr == 'class':
  901.                     tag_classes = set(self.classes(attrs))
  902.                     selector_classes = set(self.classes(selector))
  903.                     if not selector_classes.issubset(tag_classes):
  904.                         break
  905.                 else:
  906.                     if attrs[attr] != selector[attr]:
  907.                         break
  908.             else:
  909.                 # Passed all criteria of the selector
  910.                 ignored = True
  911.                 break
  912.  
  913.         # If tag matches ignore list, annotate and hand over to ignore_*
  914.         if ignored:
  915.             self.insert_annotation(tag, context + '_ignored')
  916.             self.set_handlers_to('ignore')
  917.             return
  918.  
  919.         # Standard
  920.         annotation = annotated_starttag_handler(self, tag, attrs)
  921.         self.insert_annotation(tag, annotation)
  922.  
  923.     return handler
  924.  
  925.  
  926. def retrieve_tag_annotation(annotated_endtag_handler):
  927.     # See parser logic within the GoogleParser class for documentation.
  928.     #
  929.     # annotated_endtag_handler(self, tag: str, annotation) -> None
  930.     # Returns: HTMLParser.handle_endtag(self, tag: str) -> None
  931.  
  932.     def handler(self, tag):
  933.         try:
  934.             annotation = self.tag_annotations[tag].pop()
  935.         except IndexError:
  936.             # Malformed HTML -- more close tags than open tags
  937.             annotation = None
  938.         annotated_endtag_handler(self, tag, annotation)
  939.  
  940.     return handler
  941.  
  942.  
  943. class GoogleParser(html.parser.HTMLParser):
  944.     """The members of this class parse the result
  945.    HTML page fetched from Google server for a query.
  946.  
  947.    The custom parser looks for tags enclosing search
  948.    results and extracts the URL, title and text for
  949.    each search result.
  950.  
  951.    After parsing the complete HTML page results are
  952.    returned in a list of objects of class Result.
  953.    """
  954.  
  955.     # Parser logic:
  956.     #
  957.     # - Guiding principles:
  958.     #
  959.     #   1. Tag handlers are contextual;
  960.     #
  961.     #   2. Contextual starttag and endtag handlers should come in pairs
  962.     #      and have a clear hierarchy;
  963.     #
  964.     #   3. starttag handlers should only yield control to a pair of
  965.     #      child handlers (that is, one level down the hierarchy), and
  966.     #      correspondingly, endtag handlers should only return control
  967.     #      to the parent (that is, the pair of handlers that gave it
  968.     #      control in the first place).
  969.     #
  970.     #   Principle 3 is meant to enforce a (possibly implicit) stack
  971.     #   structure and thus prevent careless jumps that result in what's
  972.     #   essentially spaghetti code with liberal use of GOTOs.
  973.     #
  974.     # - HTMLParser.handle_endtag gives us a bare tag name without
  975.     #   context, which is not good for enforcing principle 3 when we
  976.     #   have, say, nested div tags.
  977.     #
    #   In order to precisely identify the matching opening tag, we
    #   maintain a stack for each tag name with *annotations*. Important
    #   opening tags (e.g., the ones where child handlers are
    #   registered) can be annotated so that we can watch for the
    #   annotation in the endtag handler, and when the appropriate
    #   annotation is popped, we perform the corresponding action (e.g.,
    #   switch back to old handlers).
  985.     #
  986.     #   To facilitate this, each starttag handler is decorated with
  987.     #   @annotate_tag, which accepts a return value that is the
  988.     #   annotation (None by default), and additionally converts attrs to
  989.     #   a dict, which is much easier to work with; and each endtag
  990.     #   handler is decorated with @retrieve_tag_annotation which sends
  991.     #   an additional parameter that is the retrieved annotation to the
  992.     #   handler.
  993.     #
  994.     #   Note that some of our tag annotation stacks leak over time: this
  995.     #   happens to tags like <img> and <hr> which are not
  996.     #   closed. However, these tags play no structural role, and come
  997.     #   only in small quantities, so it's not really a problem.
  998.     #
  999.     # - All textual data (result title, result abstract, etc.) are
  1000.     #   processed through a set of shared handlers. These handlers store
  1001.     #   text in a shared buffer self.textbuf which can be retrieved and
  1002.     #   cleared at appropriate times.
  1003.     #
  1004.     #   Data (including charrefs and entityrefs) are ignored initially,
  1005.     #   and when data needs to be recorded, the start_populating_textbuf
  1006.     #   method is called to register the appropriate data, charref and
  1007.     #   entityref handlers so that they append to self.textbuf. When
  1008.     #   recording ends, pop_textbuf should be called to extract the text
  1009.     #   and clear the buffer. stop_populating_textbuf returns the
  1010.     #   handlers to their pristine state (ignoring data).
  1011.     #
  1012.     #   Methods:
  1013.     #   - start_populating_textbuf(self, data_transformer: Callable[[str], str]) -> None
  1014.     #   - pop_textbuf(self) -> str
  1015.     #   - stop_populating_textbuf(self) -> None
  1016.     #
  1017.     # - Outermost starttag and endtag handler methods: root_*. The whole
  1018.     #   parser starts and ends in this state.
  1019.     #
  1020.     # - Each result is wrapped in a <div> tag with class "g".
  1021.     #
  1022.     #   <!-- within the scope of root_* -->
  1023.     #   <div class="g">  <!-- annotate as 'result', hand over to result_* -->
  1024.     #   </div>           <!-- hand back to root_*, register result -->
  1025.     #
  1026.     # - For each result, the first <h3> tag with class "r" contains the
  1027.     #   hyperlinked title, and the (optional) first <div> tag with class
  1028.     #   "s" contains the abstract of the result.
  1029.     #
  1030.     #   <!-- within the scope of result_* -->
  1031.     #   <h3 class="r">   <!-- annotate as 'title', hand over to title_* -->
  1032.     #   </h3>            <!-- hand back to result_* -->
  1033.     #   <div class="s">  <!-- annotate as 'abstract', hand over to abstract_* -->
  1034.     #   </div>           <!-- hand back to result_* -->
  1035.     #
  1036.     # - Each title looks like
  1037.     #
  1038.     #   <h3 class="r">
  1039.     #     <!-- within the scope of title_* -->
  1040.     #     <span>                 <!-- filetype (optional), annotate as title_filetype,
  1041.     #                                 start_populating_textbuf -->
  1042.     #       file type (e.g. [PDF])
  1043.     #     </span>                <!-- stop_populating_textbuf -->
  1044.     #     <a href="result url">  <!-- register self.url, annotate as 'title_link',
  1045.     #                                 start_populating_textbuf -->
  1046.     #       result title
  1047.     #     </a>                   <!-- stop_populating_textbuf, pop to self.title -->
  1048.     #   </h3>
  1049.     #
  1050.     # - For each abstract, the first <span> tag with class "st" contains
  1051.     #   the body text of the abstract.
  1052.     #
  1053.     #   <!-- within the scope of abstract_* -->
  1054.     #   <span class="st">  <!-- annotate as 'abstract_text', start_populating_textbuf -->
  1055.     #     abstract text with <em> markup on keywords
  1056.     #   </span>            <!-- stop_populating_textbuf, pop to self.abstract -->
  1057.     #
  1058.     # - Certain results may come with sitelinks, secondary results that
  1059.     #   are usually subdomains or deep links within the primary
  1060.     #   result. They are organized into a <table> tag, and each sitelink
  1061.     #   is in a separate <td>:
  1062.     #
  1063.     #   <!-- within the scope of result_* -->
  1064.     #   <table>    <!-- annotate as 'sitelink_table', hand over to sitelink_table_* -->
  1065.     #     <tr>
  1066.     #       <td>   <!-- annotate as 'sitelink', hand over to sitelink_* -->
  1067.     #       </td>  <!-- append to self.sitelinks, hand back to sitelink_table_* -->
  1068.     #       <td></td>
  1069.     #       ...
  1070.     #     </tr>
  1071.     #     <tr></tr>
  1072.     #     ...
  1073.     #   </table>   <!-- hand back to result_* -->
  1074.     #
  1075.     #   Then for each sitelink, the hyperlinked title is in an <h3> tag
  1076.     #   with class "r", and the abstract is in a <div> tag with class
  1077.     #   "st". They are not necessarily on the same level, but we don't
  1078.     #   really care.
  1079.     #
  1080.     #   <!-- within the scope of sitelink_* -->
  1081.     #   <h3 class="r">             <!-- annotate as 'sitelink_title',
  1082.     #                                   hand over to sitelink_title_* -->
  1083.     #     <a href="sitelink url">  <!-- register sitelink url, annotate as 'sitelink_title_link',
  1084.     #                                   start_populating_textbuf -->
  1085.     #       sitelink title
  1086.     #     </a>                     <!-- stop_populating_textbuf, pop to sitelink title -->
  1087.     #   </h3>                      <!-- hand back to sitelink_* -->
  1088.     #
  1089.     #   <!-- still within the scope of sitelink_* -->
  1090.     #   <div class="st">  <!-- annotate as 'sitelink_abstract', start_populating_textbuf -->
  1091.     #     abstract text
  1092.     #   </div>            <!-- stop_populating_textbuf, pop to sitelink abstract -->
  1093.     #
  1094.     # - Sometimes Google autocorrects a query. Whenever this happens
  1095.     #   there will be a block whose English version reads "Showing
  1096.     #   results for ... <newline> Search instead for ...", and the HTML
  1097.     #   looks like
  1098.     #
  1099.     #   <span class="spell">Showing results for</span>
  1100.     #   <a class="spell" href="/search?q=google..."><b><i>google</i></b></a>
  1101.     #   <br>
  1102.     #   <span class="spell_orig"></span>
  1103.     #
  1104.     #   We collect the text inside a.spell as the suggested spelling
  1105.     #   (self.suggested_spelling).
  1106.     #
  1107.     #   Note that:
  1108.     #
  1109.     #   1. When npfr=1 (exact), there could still be an
  1110.     #      a.spell, in a block that reads (English version) "Did you mean:
  1111.     #      ...". Therefore, we only consider the query autocorrected when a
  1112.     #      meaningful .spell_orig is also present (self.autocorrected).
  1113.     #
  1114.     #   2. A few garbage display:none, empty tags related to spell
  1115.     #      appear to be always present: span#srfm.spell, a#srfl.spell,
  1116.     #      span#sifm.spell_orig, a#sifl.spell_orig. We need to exclude
  1117.     #      the ids srfm, srfl, sifm and sifl from our consideration.
  1118.     #
  1119.     # - Sometimes Google omits similar (more like duplicate) result
  1120.     #   entries. Whenever this happens there will be a notice in p#ofr. The way
  1121.     #   to unfilter is to simply add '&filter=0' to the query string.
  1122.     #
  1123.     #
  1124.     # Google News
  1125.     #
  1126.     # - Google News results differ from Google Search results in the
  1127.     #   following ways:
  1128.     #
  1129.     #   For each result, the title in the same format, but there's a
  1130.     #   metadata field in a <div> tag with class "slp", and the abstract
  1131.     #   isn't as deeply embedded: it's in a <div> tag on the same level
  1132.     #   with class "st".
  1133.     #
  1134.     #   <!-- within the scope of result_* -->
  1135.     #   <h3 class="r"></h3>  <!-- as before -->
  1136.     #   <div class="slp">    <!-- annotate as 'news_metadata', start_populating_textbuf -->
  1137.     #     ...
  1138.     #     <span>source</span>
  1139.     #     <span>-</span>     <!-- transform to ', ' -->
  1140.     #     <span>publishing time</span>
  1141.     #   </div>               <!-- stop_populating_textbuf, pop to self.metadata -->
  1142.     #   <div class="st">     <!-- annotate as 'news_abstract', start_populating_textbuf -->
  1143.     #     abstract text again with <em> markup on keywords
  1144.     #   </div>               <!-- stop_populating_textbuf, pop to self.abstract -->
  1145.     #
  1146.     #
  1147.     # Ignore List
  1148.     #
  1149.     # - As good as our result criteria might be, sometimes results of
  1150.     #   dubious value (usually from Google's value-add features) slip
  1151.     #   through. The "People also ask" feature is a good example of this
  1152.     #   type (a sample query is "VPN"; see screenshot
  1153.     #   https://i.imgur.com/yfcsoQz.png). In these cases, we may want to
  1154.     #   skip enclosing containers entirely. The ignore list feature is
  1155.     #   designed for this purpose.
  1156.     #
  1157.     #   The current ignore list is available in self.IGNORE_LIST. Each
  1158.     #   entry (called a "selector") is a dict of attribute-value
  1159.     #   pairs. Each attribute is matched verbatim to a tag's attribute,
  1160.     #   except the "class" attribute, where we test for inclusion
  1161.     #   instead (e.g. "c b a" matches "a b", just like it matches the
  1162.     #   CSS selector ".a.b"). There's also a special "attribute" -- tag,
  1163.     #   the meaning of which is obvious. A tag has to match all given
  1164.     #   attributes to be considered a match for the selector.
  1165.     #
  1166.     #   When a match is found, the tag is annotated as SCOPE_ignored,
  1167.     #   where SCOPE is the current handler scope (e.g., root, result,
    #   title, etc.), and the scope is switched to 'ignore'. All
    #   descendants of the tag are ignored. When the corresponding end
    #   tag is finally reached, the former scope is restored.
  1171.     #
  1172.     #
  1173.     # User Agent disabled (differences)
  1174.     #
  1175.     #   1. For Google News results, <div class="g"> is followed by <table> tag
  1176.     #       <div class="g">
  1177.     #           <table>
  1178.     #
  1179.     #   2. File mime type follows <div class="g">
  1180.     #       <div class="g"><span style="float:left"><span class="mime">[PDF]</span>&nbsp;</span>
  1181.     #
  1182.     #   3. News metadata (source and time) comes within a single tag
  1183.     #       <div class="slp"><span class="f">Reuters - 3 hours ago</span>
  1184.     #
  1185.     #   4. URLs are wrapped
  1186.     #       <a href="/url?q=http://...&sa=...">
  1187.     #
  1188.     #   5. URLs are quoted
  1189.     #       'https://vk.com/doc206446660_429188746%3Fhash%3D6097a8b0a41185cb90%26dl%3D03c63c1be5c02e8620'
  1190.     #
  1191.     #   6. Google Services links are returned as regular results,
  1192.     #      start with '/search?q=' but no following 'http' or 'https'
  1193.     #       <div class="g">
  1194.     #           <div>
  1195.     #               <h3 class="r"><a href="/search?q=india&...&sa=...">News for <b>india</b></a></h3>
  1196.     #
  1197.     #   7. YouTube specific results are returned within <table class="ts">
  1198.     #       e.g. search - '3 hours youtube'
  1199.     #
  1200.     #       <span class="st">
  1201.     #           <span class="f"><span class="nobr">10 Jun 2014</span> - <span class="nobr">179 min</span> -
  1202.     #               <span class="nobr">Uploaded by Meditation Relax Music</span>
  1203.     #           </span>
  1204.     #           <br><b>3 HOURS Best Relaxing Music</b> &#39;Romantic <b>Piano</b>&quot; Background <b>Music</b> for Stress ... 3:03 <b>...</b>
  1205.     #       </span>
  1206.     #
  1207.     #   8. There's no a.spell_orig when the query is autocorrected; the
  1208.     #      <a> tag (linking to the exact search) is wrapped in the
  1209.     #      span.spell_orig.
  1210.  
  1211.     def __init__(self, news=False):
  1212.         html.parser.HTMLParser.__init__(self)
  1213.  
  1214.         self.news = news
  1215.  
  1216.         self.autocorrected = False
  1217.         self.suggested_spelling = None
  1218.         self.filtered = False
  1219.         self.results = []
  1220.  
  1221.         self.index = 0
  1222.         self.textbuf = ''
  1223.         self.tag_annotations = {}
  1224.  
  1225.         self.set_handlers_to('root')
  1226.  
    # Ignore list
    #
    # Each entry ("selector") is a dict of attribute-value pairs a tag
    # must fully match to be skipped; 'class' is tested for subset
    # inclusion and 'tag' matches the tag name (see "Ignore List" in the
    # class comments above).
    IGNORE_LIST = [
        # "People also ask"
        # Sample query: VPN
        # Screenshot: https://i.imgur.com/yfcsoQz.png
        {
            'tag': 'div',
            'class': 'related-question-pair'
        },
        # We omit Google's "smart card" results (term coined by me) by
        # guarding against the 'g-blk' class (sample response: https://git.io/voJgB)
        {
            'tag': 'div',
            'class': 'g-blk'
        },
        # We also guard against "smart-card" results with `--noua` option
        {
            'tag': 'div',
            'class': 'hp-xpdbox'
        }
    ]
  1248.  
  1249.     # Tag handlers
  1250.  
    @annotate_tag
    def root_start(self, tag, attrs):
        # Top-level handler: watches for result containers (div.g),
        # autocorrect markers and the omitted-results notice.
        if tag == 'div' and 'g' in self.classes(attrs):
            # Initialize result field registers
            self.title = ''
            self.url = ''
            self.abstract = ''
            self.metadata = ''  # Only used for Google News
            self.sitelinks = []

            # Guard against sitelinks, which also have titles and
            # abstracts.  In the case of news, guard against "card
            # sections" (secondary results to the same event).
            self.title_registered = False
            self.abstract_registered = False
            self.metadata_registered = False  # Only used for Google News

            self.set_handlers_to('result')
            return 'result'

        # Autocorrect: a meaningful span.spell_orig means the query was
        # autocorrected (id 'sifm' marks a garbage hidden tag -- see
        # class comments).
        if tag == 'span' and 'spell_orig' in self.classes(attrs) and attrs.get('id') != 'sifm':
            self.autocorrected = True
            return
        # a.spell carries the suggested spelling (id 'srfl' is garbage).
        if tag == 'a' and 'spell' in self.classes(attrs) and attrs.get('id') != 'srfl':
            self.start_populating_textbuf()
            return 'spell'

        # Omitted-results notice (p#ofr): similar results were filtered.
        if tag == 'p' and attrs.get('id') == 'ofr':
            self.filtered = True
  1282.  
  1283.     @retrieve_tag_annotation
  1284.     def root_end(self, tag, annotation):
  1285.         if annotation == 'spell':
  1286.             self.stop_populating_textbuf()
  1287.             self.suggested_spelling = self.pop_textbuf()
  1288.  
    @annotate_tag
    def result_start(self, tag, attrs):
        # Without a User-Agent, the file type appears in span.mime right
        # after the result container opens.
        if not ua and tag == 'span' and 'mime' in self.classes(attrs):
            self.start_populating_textbuf()
            return 'title_filetype'

        # First h3.r holds the hyperlinked title (at most one per result).
        if not self.title_registered and tag == 'h3' and 'r' in self.classes(attrs):
            self.set_handlers_to('title')
            return 'title'

        # First div.s holds the abstract of the result.
        if not self.abstract_registered and tag == 'div' and 's' in self.classes(attrs):
            self.set_handlers_to('abstract')
            return 'abstract'

        # Without a User-Agent, Google Services results put abstract text
        # directly in span.st; join data chunks with a trailing space.
        if not ua and not self.abstract_registered \
                and tag == 'span' and 'st' in self.classes(attrs):
            self.start_populating_textbuf(lambda text: text + ' ')
            return 'abstract_gservices'

        # A <table> wraps sitelinks; without a User-Agent, skip table.ts
        # (YouTube-specific results -- see class comments) in non-news mode.
        if not self.sitelinks and tag == 'table':
            if ua or (not self.news and 'ts' not in self.classes(attrs)):
                self.set_handlers_to('sitelink_table')
                return 'sitelink_table'

        if self.news:
            # News metadata (source and publishing time) lives in div.slp.
            if not self.metadata_registered and tag == 'div' and 'slp' in self.classes(attrs):
                # Change metadata field separator from '-' to ', ' for better appearance
                if ua:
                    self.start_populating_textbuf(lambda text: ', ' if text == '-' else text)
                else:
                    self.start_populating_textbuf(lambda text:
                                                  text.replace(' -', ',', 1) if ' - ' in text else text)
                return 'news_metadata'

            # News abstract lives in a sibling div.st.
            if not self.abstract_registered and tag == 'div' and 'st' in self.classes(attrs):
                self.start_populating_textbuf()
                return 'news_abstract'
  1326.  
    @retrieve_tag_annotation
    def result_end(self, tag, annotation):
        if annotation == 'result':
            # The div.g container closed: register the result (only if a
            # URL was captured) and return control to the root scope.
            if self.url:
                self.index += 1
                result = Result(self.index, self.title, self.url, self.abstract,
                                metadata=self.metadata if self.metadata else None,
                                sitelinks=self.sitelinks)
                self.results.append(result)
            self.set_handlers_to('root')
        elif annotation == 'news_metadata':
            self.stop_populating_textbuf()
            self.metadata = self.pop_textbuf()
            self.metadata_registered = True
        elif annotation == 'news_abstract':
            self.stop_populating_textbuf()
            self.abstract = self.pop_textbuf()
            self.abstract_registered = True
        elif annotation == 'abstract_gservices':
            self.stop_populating_textbuf()
            # Collapse double spaces introduced by the chunk joiner.
            self.abstract = self.pop_textbuf().replace('  ', ' ')
            self.abstract_registered = True
  1349.  
    @annotate_tag
    def title_start(self, tag, attrs):
        if ua and tag == 'span':
            # Print a space after the filetype indicator
            self.start_populating_textbuf(lambda text: text + ' ')
            return 'title_filetype'
        if tag == 'a' and 'href' in attrs:
            # Skip 'News for', 'Images for' search links
            if attrs['href'].startswith('/search'):
                return

            # Skip card results
            if not ua and "fl" in self.classes(attrs):
                return

            self.url = attrs['href']
            # Without a User-Agent, result URLs are wrapped as
            # /url?q=<quoted-url>&sa=... -- unwrap and unquote them.
            try:
                start = self.url.index('?q=') + len('?q=')
                end = self.url.index('&sa=', start)
                self.url = urllib.parse.unquote_plus(self.url[start:end])
            except ValueError:
                # Not a wrapped URL; keep the href as is.
                pass
            self.start_populating_textbuf()
            return 'title_link'
  1374.  
  1375.     @retrieve_tag_annotation
  1376.     def title_end(self, tag, annotation):
  1377.         if annotation == 'title_filetype':
  1378.             self.stop_populating_textbuf()
  1379.         elif annotation == 'title_link':
  1380.             self.stop_populating_textbuf()
  1381.             self.title = self.pop_textbuf()
  1382.             self.title_registered = True
  1383.         elif annotation == 'title':
  1384.             self.set_handlers_to('result')
  1385.  
  1386.     @annotate_tag
  1387.     def abstract_start(self, tag, attrs):
  1388.         if (not self.metadata_registered and
  1389.                 tag == 'div' and 'slp' in self.classes(attrs)):
  1390.             self.start_populating_textbuf()
  1391.             return 'result_metadata'
  1392.         if tag == 'span' and 'st' in self.classes(attrs):
  1393.             self.start_populating_textbuf()
  1394.             return 'abstract_text'
  1395.  
  1396.     @retrieve_tag_annotation
  1397.     def abstract_end(self, tag, annotation):
  1398.         if annotation == 'result_metadata':
  1399.             self.stop_populating_textbuf()
  1400.             self.metadata = self.pop_textbuf().strip().replace('\u200e', '')
  1401.             self.metadata_registered = True
  1402.         elif annotation == 'abstract_text':
  1403.             self.stop_populating_textbuf()
  1404.             self.abstract = self.pop_textbuf()
  1405.             self.abstract_registered = True
  1406.         elif annotation == 'abstract':
  1407.             self.set_handlers_to('result')
  1408.  
  1409.     @annotate_tag
  1410.     def sitelink_table_start(self, tag, attrs):
  1411.         if tag == 'td':
  1412.             # Initialize a new sitelink
  1413.             self.current_sitelink = Sitelink('', '', '')
  1414.             self.set_handlers_to('sitelink')
  1415.             return 'sitelink'
  1416.  
  1417.     @retrieve_tag_annotation
  1418.     def sitelink_table_end(self, tag, annotation):
  1419.         if annotation == 'sitelink_table':
  1420.             self.set_handlers_to('result')
  1421.  
  1422.     @annotate_tag
  1423.     def sitelink_start(self, tag, attrs):
  1424.         if tag == 'h3' and 'r' in self.classes(attrs):
  1425.             self.set_handlers_to('sitelink_title')
  1426.             return 'sitelink_title'
  1427.         if tag == 'div' and 'st' in self.classes(attrs):
  1428.             self.start_populating_textbuf()
  1429.             return 'sitelink_abstract'
  1430.  
  1431.     @retrieve_tag_annotation
  1432.     def sitelink_end(self, tag, annotation):
  1433.         if annotation == 'sitelink_abstract':
  1434.             self.stop_populating_textbuf()
  1435.             self.current_sitelink.abstract = self.pop_textbuf()
  1436.         elif annotation == 'sitelink':
  1437.             if self.current_sitelink.url:
  1438.                 self.sitelinks.append(self.current_sitelink)
  1439.             self.set_handlers_to('sitelink_table')
  1440.  
    @annotate_tag
    def sitelink_title_start(self, tag, attrs):
        if tag == 'a' and 'href' in attrs:
            self.current_sitelink.url = attrs['href']
            # Unwrap /url?q=<quoted-url>&sa=... redirection URLs, same
            # as in title_start.
            try:
                start = self.current_sitelink.url.index('?q=') + len('?q=')
                end = self.current_sitelink.url.index('&sa=', start)
                self.current_sitelink.url = urllib.parse.unquote_plus(self.current_sitelink.url[start:end])
            except ValueError:
                # Not a wrapped URL; keep the href as is.
                pass
            self.start_populating_textbuf()
            return 'sitelink_title_link'
  1453.  
  1454.     @retrieve_tag_annotation
  1455.     def sitelink_title_end(self, tag, annotation):
  1456.         if annotation == 'sitelink_title_link':
  1457.             self.stop_populating_textbuf()
  1458.             self.current_sitelink.title = self.pop_textbuf()
  1459.         elif annotation == 'sitelink_title':
  1460.             self.set_handlers_to('sitelink')
  1461.  
  1462.     # Generic methods
  1463.  
  1464.     # Set handle_starttag to SCOPE_start, and handle_endtag to SCOPE_end.
  1465.     def set_handlers_to(self, scope):
  1466.         self.handle_starttag = getattr(self, scope + '_start')
  1467.         self.handle_endtag = getattr(self, scope + '_end')
  1468.  
  1469.     def insert_annotation(self, tag, annotation):
  1470.         if tag not in self.tag_annotations:
  1471.             self.tag_annotations[tag] = []
  1472.         self.tag_annotations[tag].append(annotation)
  1473.  
    @annotate_tag
    def ignore_start(self, tag, attrs):
        # Swallow all start tags while inside an ignored scope; annotation
        # bookkeeping is handled entirely by the decorator.
        pass
  1477.  
  1478.     @retrieve_tag_annotation
  1479.     def ignore_end(self, tag, annotation):
  1480.         if annotation and annotation.endswith('_ignored'):
  1481.             # Strip '-ignore' suffix from annotation to obtain the outer
  1482.             # context name.
  1483.             context = annotation[:-8]
  1484.             self.set_handlers_to(context)
  1485.  
  1486.     def start_populating_textbuf(self, data_transformer=None):
  1487.         if data_transformer is None:
  1488.             # Record data verbatim
  1489.             self.handle_data = self.record_data
  1490.         else:
  1491.             def record_transformed_data(data):
  1492.                 self.textbuf += data_transformer(data)
  1493.  
  1494.             self.handle_data = record_transformed_data
  1495.  
  1496.         self.handle_entityref = self.record_entityref
  1497.         self.handle_charref = self.record_charref
  1498.  
  1499.     def pop_textbuf(self):
  1500.         text = self.textbuf
  1501.         self.textbuf = ''
  1502.         return text
  1503.  
  1504.     def stop_populating_textbuf(self):
  1505.         self.handle_data = lambda data: None
  1506.         self.handle_entityref = lambda ref: None
  1507.         self.handle_charref = lambda ref: None
  1508.  
    def record_data(self, data):
        # Default data handler: append character data to the buffer verbatim.
        self.textbuf += data
  1511.  
  1512.     def record_entityref(self, ref):
  1513.         try:
  1514.             self.textbuf += chr(html.entities.name2codepoint[ref])
  1515.         except KeyError:
  1516.             # Entity name not found; most likely rather sloppy HTML
  1517.             # where a literal ampersand is not escaped; For instance,
  1518.             # the HTML response returned by
  1519.             #
  1520.             #     googler -c au -l ko expected
  1521.             #
  1522.             # contains the following tag
  1523.             #
  1524.             #     <p class="_e4b"><a href="...">expected market return s&p 500</a></p>
  1525.             #
  1526.             # where &p is interpreted by HTMLParser as an entity (this
  1527.             # behaviour seems to be specific to Python 2.7).
  1528.             self.textbuf += '&' + ref
  1529.  
  1530.     def record_charref(self, ref):
  1531.         if ref.startswith('x'):
  1532.             char = chr(int(ref[1:], 16))
  1533.         else:
  1534.             char = chr(int(ref))
  1535.         self.textbuf += char
  1536.  
    @staticmethod
    def classes(attrs):
        """Get tag's classes from its attribute dict."""
        # 'class="a b"' -> ['a', 'b']; missing class attribute -> [].
        return attrs.get('class', '').split()
  1541.  
  1542.  
  1543. class Sitelink(object):
  1544.     """Container for a sitelink."""
  1545.  
  1546.     def __init__(self, title, url, abstract):
  1547.         self.title = title
  1548.         self.url = url
  1549.         self.abstract = abstract
  1550.         self.index = ''
  1551.  
  1552.  
  1553. Colors = collections.namedtuple('Colors', 'index, title, url, metadata, abstract, prompt, reset')
  1554.  
  1555.  
  1556. class Result(object):
  1557.     """
  1558.    Container for one search result, with output helpers.
  1559.  
  1560.    Parameters
  1561.    ----------
  1562.    index : int or str
  1563.    title : str
  1564.    url : str
  1565.    abstract : str
  1566.    metadata : str, optional
  1567.        Only applicable to Google News results, with publisher name and
  1568.        publishing time.
  1569.    sitelinks : list, optional
  1570.        List of ``SiteLink`` objects.
  1571.  
  1572.    Attributes
  1573.    ----------
  1574.    index : str
  1575.    title : str
  1576.    url : str
  1577.    abstract : str
  1578.    metadata : str or None
  1579.    sitelinks : list
  1580.  
  1581.    Class Variables
  1582.    ---------------
  1583.    colors : str
  1584.  
  1585.    Methods
  1586.    -------
  1587.    print()
  1588.    jsonizable_object()
  1589.    urltable()
  1590.  
  1591.    """
  1592.  
  1593.     # Class variables
  1594.     colors = None
  1595.     urlexpand = True
  1596.  
  1597.     def __init__(self, index, title, url, abstract, metadata=None, sitelinks=None):
  1598.         index = str(index)
  1599.         self.index = index
  1600.         self.title = title
  1601.         self.url = url
  1602.         self.abstract = abstract
  1603.         self.metadata = metadata
  1604.         self.sitelinks = [] if sitelinks is None else sitelinks
  1605.  
  1606.         self._urltable = {index: url}
  1607.         subindex = 'a'
  1608.         for sitelink in sitelinks:
  1609.             fullindex = index + subindex
  1610.             sitelink.index = fullindex
  1611.             self._urltable[fullindex] = sitelink.url
  1612.             subindex = chr(ord(subindex) + 1)
  1613.  
  1614.     def _print_title_and_url(self, index, title, url, indent=0):
  1615.         colors = self.colors
  1616.  
  1617.         if not self.urlexpand:
  1618.             segments = urllib.parse.urlparse(url)
  1619.             url = '  [' + segments.netloc + ']'
  1620.  
  1621.         # Pad index and url with `indent` number of spaces
  1622.         index = ' ' * indent + str(index)
  1623.         url = ' ' * indent + url
  1624.         if colors:
  1625.             print(colors.index + index + colors.reset, end='')
  1626.             if self.urlexpand:
  1627.                 print(' ' + colors.title + title + colors.reset)
  1628.                 print(colors.url + url + colors.reset)
  1629.             else:
  1630.                 print(' ' + colors.title + title + colors.reset + colors.url + url + colors.reset)
  1631.         else:
  1632.             if self.urlexpand:
  1633.                 print(' %s %s\n%s' % (index, title, url))
  1634.             else:
  1635.                 print(' %s %s%s' % (index, title, url))
  1636.  
  1637.     def _print_metadata_and_abstract(self, abstract, metadata=None, indent=0):
  1638.         colors = self.colors
  1639.         try:
  1640.             columns, _ = os.get_terminal_size()
  1641.         except OSError:
  1642.             columns = 0
  1643.  
  1644.         if metadata:
  1645.             if colors:
  1646.                 print(colors.metadata + metadata + colors.reset)
  1647.             else:
  1648.                 print(metadata)
  1649.  
  1650.         if colors:
  1651.             print(colors.abstract, end='')
  1652.         if columns > indent + 1:
  1653.             # Try to fill to columns
  1654.             fillwidth = columns - indent - 1
  1655.             for line in textwrap.wrap(abstract.replace('\n', ''), width=fillwidth):
  1656.                 print('%s%s' % (' ' * indent, line))
  1657.             print('')
  1658.         else:
  1659.             print('%s\n' % abstract.replace('\n', ' '))
  1660.         if colors:
  1661.             print(colors.reset, end='')
  1662.  
  1663.     def print(self):
  1664.         """Print the result entry."""
  1665.         self._print_title_and_url(self.index, self.title, self.url)
  1666.         self._print_metadata_and_abstract(self.abstract, metadata=self.metadata)
  1667.  
  1668.         for sitelink in self.sitelinks:
  1669.             self._print_title_and_url(sitelink.index, sitelink.title, sitelink.url, indent=4)
  1670.             self._print_metadata_and_abstract(sitelink.abstract, indent=4)
  1671.  
  1672.     def jsonizable_object(self):
  1673.         """Return a JSON-serializable dict representing the result entry."""
  1674.         obj = {
  1675.             'title': self.title,
  1676.             'url': self.url,
  1677.             'abstract': self.abstract
  1678.         }
  1679.         if self.metadata:
  1680.             obj['metadata'] = self.metadata
  1681.         if self.sitelinks:
  1682.             obj['sitelinks'] = [sitelink.__dict__ for sitelink in self.sitelinks]
  1683.         return obj
  1684.  
  1685.     def urltable(self):
  1686.         """Return a index-to-URL table for the current result.
  1687.  
  1688.        Normally, the table contains only a single entry, but when the result
  1689.        contains sitelinks, all sitelinks are included in this table.
  1690.  
  1691.        Returns
  1692.        -------
  1693.        dict
  1694.            A dict mapping indices (strs) to URLs (also strs). Indices of
  1695.            sitelinks are the original index appended by lowercase letters a,
  1696.            b, c, etc.
  1697.  
  1698.        """
  1699.         return self._urltable
  1700.  
  1701.  
  1702. class GooglerCmdException(Exception):
  1703.     pass
  1704.  
  1705.  
  1706. class NoKeywordsException(GooglerCmdException):
  1707.     pass
  1708.  
  1709.  
  1710. def require_keywords(method):
  1711.     # Require keywords to be set before we run a GooglerCmd method. If
  1712.     # no keywords have been set, raise a NoKeywordsException.
  1713.     @functools.wraps(method)
  1714.     def enforced_method(self, *args, **kwargs):
  1715.         if not self.keywords:
  1716.             raise NoKeywordsException('No keywords.')
  1717.         method(self, *args, **kwargs)
  1718.  
  1719.     return enforced_method
  1720.  
  1721.  
  1722. def no_argument(method):
  1723.     # Normalize a do_* method of GooglerCmd that takes no argument to
  1724.     # one that takes an arg, but issue a warning when an nonempty
  1725.     # argument is given.
  1726.     @functools.wraps(method)
  1727.     def enforced_method(self, arg):
  1728.         if arg:
  1729.             method_name = arg.__name__
  1730.             command_name = method_name[3:] if method_name.startswith('do_') else method_name
  1731.             logger.warning("Argument to the '%s' command ignored.", command_name)
  1732.         method(self)
  1733.  
  1734.     return enforced_method
  1735.  
  1736.  
class GooglerCmd(object):
    """
    Command line interpreter and executor class for googler.

    Inspired by PSL cmd.Cmd.

    Parameters
    ----------
    opts : argparse.Namespace
        Options and/or arguments.

    Attributes
    ----------
    options : argparse.Namespace
        Options that are currently in effect. Read-only attribute.
    keywords : str or list or strs
        Current keywords. Read-only attribute

    Methods
    -------
    fetch()
    display_results(prelude='\n', json_output=False)
    fetch_and_display(prelude='\n', json_output=False, interactive=True)
    read_next_command()
    help()
    cmdloop()
    """

    # Class variables
    colors = None

    def __init__(self, opts):
        super().__init__()

        self._opts = opts

        # Construct the search URL from the options and open a connection
        # (possibly through a proxy) to its host.
        self._google_url = GoogleUrl(opts)
        proxy = opts.proxy if hasattr(opts, 'proxy') else None
        self._conn = GoogleConnection(self._google_url.hostname, proxy=proxy,
                                      notweak=opts.notweak)
        # Ensure the connection is closed when the interpreter exits.
        atexit.register(self._conn.close)

        self.results = []
        self._autocorrected_to = None
        self._results_filtered = False
        self._urltable = {}

        # Color the omniprompt unless DISABLE_PROMPT_COLOR is set.
        self.promptcolor = True if os.getenv('DISABLE_PROMPT_COLOR') is None else False

    @property
    def options(self):
        """Current options."""
        return self._opts

    @property
    def keywords(self):
        """Current keywords."""
        return self._google_url.keywords

    @require_keywords
    def fetch(self):
        """Fetch a page and parse for results.

        Results are stored in ``self.results``.

        Raises
        ------
        GoogleConnectionError

        See Also
        --------
        fetch_and_display

        """
        # This method also sets self._results_filtered and
        # self._urltable.
        page = self._conn.fetch_page(self._google_url.relative())

        if logger.isEnabledFor(logging.DEBUG):
            # Debug aid: dump the raw response body to a temp file.
            import tempfile
            fd, tmpfile = tempfile.mkstemp(prefix='googler-response-')
            os.close(fd)
            with open(tmpfile, 'w', encoding='utf-8') as fp:
                fp.write(page)
            logger.debug("Response body written to '%s'.", tmpfile)

        parser = GoogleParser(news=self._google_url.news)
        parser.feed(page)

        self.results = parser.results
        self._autocorrected_to = parser.suggested_spelling if parser.autocorrected else None
        self._results_filtered = parser.filtered
        # Rebuild the index-to-URL table from the fresh results.
        self._urltable = {}
        for r in self.results:
            self._urltable.update(r.urltable())

    @require_keywords
    def display_results(self, prelude='\n', json_output=False):
        """Display results stored in ``self.results``.

        Parameters
        ----------
        See `fetch_and_display`.

        """
        if json_output:
            # JSON output
            import json
            results_object = [r.jsonizable_object() for r in self.results]
            print(json.dumps(results_object, indent=2, sort_keys=True, ensure_ascii=False))
        else:
            # Regular output
            if not self.results:
                print('No results.', file=sys.stderr)
            else:
                sys.stderr.write(prelude)
                for r in self.results:
                    r.print()

    @require_keywords
    def fetch_and_display(self, prelude='\n', json_output=False, interactive=True):
        """Fetch a page and display results.

        Results are stored in ``self.results``.

        Parameters
        ----------
        prelude : str, optional
            A string that is written to stderr before showing actual results,
            usually serving as a separator. Default is an empty line.
        json_output : bool, optional
            Whether to dump results in JSON format. Default is False.
        interactive : bool, optional
            Whether to show contextual instructions, when e.g. Google
            has filtered the results. Default is True.

        Raises
        ------
        GoogleConnectionError

        See Also
        --------
        fetch
        display_results

        """
        # NOTE(review): `interactive` is not referenced in this body; the
        # contextual hints below are printed unconditionally.
        self.fetch()
        colors = self.colors
        if self._autocorrected_to:
            if colors:
                # Underline the keywords
                autocorrected_to = '\x1b[4m' + self._autocorrected_to + '\x1b[24m'
            else:
                autocorrected_to = self._autocorrected_to
            autocorrect_info = ('Showing results for %s; enter "x" for an exact search.' %
                                autocorrected_to)
            printerr('')
            if colors:
                printerr(colors.prompt + autocorrect_info + colors.reset)
            else:
                printerr('** ' + autocorrect_info)
        self.display_results(prelude=prelude, json_output=json_output)
        if self._results_filtered:
            unfilter_info = 'Enter "unfilter" to show similar results Google omitted.'
            if colors:
                printerr(colors.prompt + unfilter_info + colors.reset)
            else:
                printerr('** ' + unfilter_info)
            printerr('')

    def read_next_command(self):
        """Show omniprompt and read user command line.

        Command line is always stripped, and each consecutive group of
        whitespace is replaced with a single space character. If the
        command line is empty after stripping, then ignore it and keep
        reading. Exit with status 0 if we get EOF or an empty line
        (pre-strip, that is, a raw <enter>) twice in a row.

        The new command line (non-empty) is stored in ``self.cmd``.

        """
        colors = self.colors
        message = 'googler (? for help)'
        prompt = (colors.prompt + message + colors.reset + ' ') if (colors and self.promptcolor) else (message + ': ')
        enter_count = 0
        while True:
            try:
                cmd = input(prompt)
            except EOFError:
                sys.exit(0)

            if not cmd:
                enter_count += 1
                if enter_count == 2:
                    # Double <enter>
                    sys.exit(0)
            else:
                enter_count = 0

            # Collapse internal runs of whitespace into single spaces.
            cmd = ' '.join(cmd.split())
            if cmd:
                self.cmd = cmd
                break

    @staticmethod
    def help():
        # Print the omniprompt key reference to stderr.
        GooglerArgumentParser.print_omniprompt_help(sys.stderr)
        printerr('')

    @require_keywords
    @no_argument
    def do_first(self):
        # Jump back to the first page of results.
        try:
            self._google_url.first_page()
        except ValueError as e:
            print(e, file=sys.stderr)
            return

        self.fetch_and_display()

    def do_google(self, arg):
        # Update keywords and reconstruct URL
        self._opts.keywords = arg
        self._google_url = GoogleUrl(self._opts)
        self.fetch_and_display()

    @require_keywords
    @no_argument
    def do_next(self):
        # If > 5 results are being fetched each time,
        # block next when no parsed results in current fetch
        if not self.results and self._google_url._num > 5:
            printerr('No results.')
        else:
            self._google_url.next_page()
            self.fetch_and_display()

    @require_keywords
    def do_open(self, *args):
        # With no arguments, open the search itself in the browser.
        if not args:
            open_url(self._google_url.full())
            return

        for nav in args:
            if nav == 'a':
                # 'a' opens every URL in the current table.
                for key, value in sorted(self._urltable.items()):
                    open_url(self._urltable[key])
            elif nav in self._urltable:
                open_url(self._urltable[nav])
            elif '-' in nav:
                # Numeric range, e.g. "2-5" (sitelink indices unsupported).
                try:
                    vals = [int(x) for x in nav.split('-')]
                    if (len(vals) != 2):
                        printerr('Invalid range %s.' % nav)
                        continue

                    if vals[0] > vals[1]:
                        vals[0], vals[1] = vals[1], vals[0]

                    for _id in range(vals[0], vals[1] + 1):
                        if str(_id) in self._urltable:
                            open_url(self._urltable[str(_id)])
                        else:
                            printerr('Invalid index %s.' % _id)
                except ValueError:
                    printerr('Invalid range %s.' % nav)
            else:
                printerr('Invalid index %s.' % nav)

    @require_keywords
    @no_argument
    def do_previous(self):
        try:
            self._google_url.prev_page()
        except ValueError as e:
            print(e, file=sys.stderr)
            return

        self.fetch_and_display()

    @require_keywords
    @no_argument
    def do_exact(self):
        # Reset start to 0 when exact is applied.
        self._google_url.update(start=0, exact=True)
        self.fetch_and_display()

    @require_keywords
    @no_argument
    def do_unfilter(self):
        # Reset start to 0 when unfilter is applied.
        self._google_url.update(start=0)
        self._google_url.set_queries(filter=0)
        self.fetch_and_display()

    def cmdloop(self):
        """Run REPL."""
        if self.keywords:
            self.fetch_and_display()
        else:
            printerr('Please initiate a query.')

        while True:
            self.read_next_command()
            # TODO: Automatic dispatcher
            #
            # We can't write a dispatcher for now because that could
            # change behaviour of the prompt. However, we have already
            # laid a lot of ground work for the dispatcher, e.g., the
            # `no_argument' decorator.
            try:
                cmd = self.cmd
                if cmd == 'f':
                    self.do_first('')
                elif cmd.startswith('g '):
                    self.do_google(cmd[2:])
                elif cmd == 'n':
                    self.do_next('')
                elif cmd == 'o':
                    self.do_open()
                elif cmd.startswith('o '):
                    self.do_open(*cmd[2:].split())
                elif cmd.startswith('O '):
                    # Temporarily force a GUI browser for this command only.
                    open_url.override_text_browser = True
                    self.do_open(*cmd[2:].split())
                    open_url.override_text_browser = False
                elif cmd == 'p':
                    self.do_previous('')
                elif cmd == 'q':
                    break
                elif cmd == 'x':
                    self.do_exact('')
                elif cmd == 'unfilter':
                    self.do_unfilter('')
                elif cmd == '?':
                    self.help()
                elif cmd in self._urltable:
                    open_url(self._urltable[cmd])
                elif self.keywords and cmd.isdigit() and int(cmd) < 100:
                    printerr('Index out of bound. To search for the number, use g.')
                elif cmd == 'u':
                    Result.urlexpand = not Result.urlexpand
                    printerr('url expansion toggled.')
                elif cmd.startswith('c ') and cmd[2:].isdigit():
                    try:
                        # try copying the url to clipboard using native utilities
                        copier_params = []
                        copier_mode = 'stdin'
                        if sys.platform.startswith(('linux', 'freebsd', 'openbsd')):
                            if shutil.which('xsel') is not None:
                                copier_params = ['xsel', '-b', '-i']
                            elif shutil.which('xclip') is not None:
                                copier_params = ['xclip', '-selection', 'clipboard']
                            elif shutil.which('termux-clipboard-set') is not None:
                                copier_params = ['termux-clipboard-set']
                        elif sys.platform == 'darwin':
                            copier_params = ['pbcopy']
                        elif sys.platform == 'win32':
                            copier_params = ['clip']

                        # If native clipboard utilities are absent, try to use terminal
                        # multiplexers, tmux/GNU screen, as fallback.
                        if not copier_params:
                            if os.getenv('TMUX_PANE'):  # check for tmux
                                # Try to use tmux buffer as fallback. Use case suggested by #230.
                                copier_params = ['tmux', 'set-buffer']
                                copier_mode = 'cmdline_arg'
                            elif os.getenv('STY'):  # check for GNU screen
                                # Try to use GNU screen's exchange-file as fallback.
                                copier_params = ['screen', '-X', 'readbuf']
                                copier_mode = 'ext_file'

                        if not copier_params:
                            printerr('failed to locate suitable clipboard utility')
                        else:
                            content = self._urltable[cmd[2:]].encode('utf-8')
                            if copier_mode == 'stdin':
                                Popen(copier_params, stdin=PIPE,
                                      stdout=DEVNULL, stderr=DEVNULL).communicate(content)
                            elif copier_mode == 'cmdline_arg':
                                Popen(copier_params + [content], stdin=DEVNULL,
                                      stdout=DEVNULL, stderr=DEVNULL).communicate()
                            else:
                                # GNU screen: write the URL to screen's
                                # exchange file, then tell screen to read it.
                                with open('/tmp/screen-exchange', 'wb') as f:
                                    f.write(content)
                                Popen(copier_params, stdin=DEVNULL,
                                      stdout=DEVNULL, stderr=DEVNULL).communicate()
                    except Exception:
                        # NOTE(review): any clipboard failure here (including a
                        # KeyError for a nonexistent index) is surfaced as
                        # NoKeywordsException and printed as "Initiate a query
                        # first." — misleading, but preserved as-is.
                        raise NoKeywordsException
                else:
                    self.do_google(cmd)
            except NoKeywordsException:
                printerr('Initiate a query first.')
  2131.  
  2132.  
  2133. class GooglerArgumentParser(argparse.ArgumentParser):
  2134.     """Custom argument parser for googler."""
  2135.  
  2136.     # Print omniprompt help
  2137.     @staticmethod
  2138.     def print_omniprompt_help(file=None):
  2139.         file = sys.stderr if file is None else file
  2140.         file.write(textwrap.dedent("""
  2141.        omniprompt keys:
  2142.          n, p                  fetch the next or previous set of search results
  2143.          index                 open the result corresponding to index in browser
  2144.          f                     jump to the first page
  2145.          o [index|range|a ...] open space-separated result indices, numeric ranges
  2146.                                (sitelinks unsupported in ranges), or all, in browser
  2147.                                open the current search in browser, if no arguments
  2148.          O [index|range|a ...] like key 'o', but try to open in a GUI browser
  2149.          g keywords            new Google search for 'keywords' with original options
  2150.                                should be used to search omniprompt keys and indices
  2151.          c index               copy url to clipboard
  2152.          u                     toggle url expansion
  2153.          q, ^D, double Enter   exit googler
  2154.          ?                     show omniprompt help
  2155.          *                     other inputs issue a new search with original options
  2156.        """))
  2157.  
  2158.     # Print information on googler
  2159.     @staticmethod
  2160.     def print_general_info(file=None):
  2161.         file = sys.stderr if file is None else file
  2162.         file.write(textwrap.dedent("""
  2163.        Version %s
  2164.        Copyright © 2008 Henri Hakkinen
  2165.        Copyright © 2015-2018 Arun Prakash Jana <engineerarun@gmail.com>
  2166.        Zhiming Wang <zmwangx@gmail.com>
  2167.        License: GPLv3
  2168.        Webpage: https://github.com/jarun/googler
  2169.        """ % _VERSION_))
  2170.  
  2171.     # Augment print_help to print more than synopsis and options
  2172.     def print_help(self, file=None):
  2173.         super().print_help(file)
  2174.         self.print_omniprompt_help(file)
  2175.         self.print_general_info(file)
  2176.  
  2177.     # Automatically print full help text on error
  2178.     def error(self, message):
  2179.         sys.stderr.write('%s: error: %s\n\n' % (self.prog, message))
  2180.         self.print_help(sys.stderr)
  2181.         self.exit(2)
  2182.  
  2183.     # Type guards
  2184.     @staticmethod
  2185.     def positive_int(arg):
  2186.         """Try to convert a string into a positive integer."""
  2187.         try:
  2188.             n = int(arg)
  2189.             assert n > 0
  2190.             return n
  2191.         except (ValueError, AssertionError):
  2192.             raise argparse.ArgumentTypeError('%s is not a positive integer' % arg)
  2193.  
  2194.     @staticmethod
  2195.     def nonnegative_int(arg):
  2196.         """Try to convert a string into a nonnegative integer."""
  2197.         try:
  2198.             n = int(arg)
  2199.             assert n >= 0
  2200.             return n
  2201.         except (ValueError, AssertionError):
  2202.             raise argparse.ArgumentTypeError('%s is not a non-negative integer' % arg)
  2203.  
  2204.     @staticmethod
  2205.     def is_duration(arg):
  2206.         """Check if a string is a valid duration accepted by Google.
  2207.  
  2208.        A valid duration is of the form dNUM, where d is a single letter h
  2209.        (hour), d (day), w (week), m (month), or y (year), and NUM is a
  2210.        non-negative integer.
  2211.        """
  2212.         try:
  2213.             if arg[0] not in ('h', 'd', 'w', 'm', 'y') or int(arg[1:]) < 0:
  2214.                 raise ValueError
  2215.         except (TypeError, IndexError, ValueError):
  2216.             raise argparse.ArgumentTypeError('%s is not a valid duration' % arg)
  2217.         return arg
  2218.  
  2219.     @staticmethod
  2220.     def is_colorstr(arg):
  2221.         """Check if a string is a valid color string."""
  2222.         try:
  2223.             assert len(arg) == 6
  2224.             for c in arg:
  2225.                 assert c in COLORMAP
  2226.         except AssertionError:
  2227.             raise argparse.ArgumentTypeError('%s is not a valid color string' % arg)
  2228.         return arg
  2229.  
  2230.  
  2231. # Self-upgrade mechanism
  2232.  
  2233. def system_is_windows():
  2234.     """Checks if the underlying system is Windows (Cygwin included)."""
  2235.     return sys.platform in {'win32', 'cygwin'}
  2236.  
  2237.  
  2238. def download_latest_googler(include_git=False):
  2239.     """Download latest googler to a temp file.
  2240.  
  2241.    By default, the latest released version is downloaded, but if
  2242.    `include_git` is specified, then the latest git master is downloaded
  2243.    instead.
  2244.  
  2245.    Parameters
  2246.    ----------
  2247.    include_git : bool, optional
  2248.        Download from git master. Default is False.
  2249.  
  2250.    Returns
  2251.    -------
  2252.    (git_ref, path): tuple
  2253.         A tuple containing the git reference (either name of the latest
  2254.         tag or SHA of the latest commit) and path to the downloaded
  2255.         file.
  2256.  
  2257.    """
  2258.     import urllib.request
  2259.  
  2260.     if include_git:
  2261.         # Get SHA of latest commit on master
  2262.         request = urllib.request.Request('%s/commits/master' % API_REPO_BASE,
  2263.                                          headers={'Accept': 'application/vnd.github.v3.sha'})
  2264.         response = urllib.request.urlopen(request)
  2265.         if response.status != 200:
  2266.             raise http.client.HTTPException(response.reason)
  2267.         git_ref = response.read().decode('utf-8')
  2268.     else:
  2269.         # Get name of latest tag
  2270.         request = urllib.request.Request('%s/releases?per_page=1' % API_REPO_BASE,
  2271.                                          headers={'Accept': 'application/vnd.github.v3+json'})
  2272.         response = urllib.request.urlopen(request)
  2273.         if response.status != 200:
  2274.             raise http.client.HTTPException(response.reason)
  2275.         import json
  2276.         git_ref = json.loads(response.read().decode('utf-8'))[0]['tag_name']
  2277.  
  2278.     # Download googler to a tempfile
  2279.     googler_download_url = '%s/%s/googler' % (RAW_DOWNLOAD_REPO_BASE, git_ref)
  2280.     printerr('Downloading %s' % googler_download_url)
  2281.     request = urllib.request.Request(googler_download_url,
  2282.                                      headers={'Accept-Encoding': 'gzip'})
  2283.     import tempfile
  2284.     fd, path = tempfile.mkstemp()
  2285.     atexit.register(lambda: os.remove(path) if os.path.exists(path) else None)
  2286.     os.close(fd)
  2287.     with open(path, 'wb') as fp:
  2288.         with urllib.request.urlopen(request) as response:
  2289.             if response.status != 200:
  2290.                 raise http.client.HTTPException(response.reason)
  2291.             payload = response.read()
  2292.             try:
  2293.                 fp.write(gzip.decompress(payload))
  2294.             except OSError:
  2295.                 fp.write(payload)
  2296.     return git_ref, path
  2297.  
  2298.  
  2299. def self_replace(path):
  2300.     """Replace the current script with a specified file.
  2301.  
  2302.    Both paths (the specified path and path to the current script) are
  2303.    resolved to absolute, symlink-free paths. Upon replacement, the
  2304.    owner and mode signatures of the current script are preserved. The
  2305.    caller needs to have the necessary permissions.
  2306.  
  2307.    Replacement won't happen if the specified file is the same
  2308.    (content-wise) as the current script.
  2309.  
  2310.    Parameters
  2311.    ----------
  2312.    path : str
  2313.        Path to the replacement file.
  2314.  
  2315.    Returns
  2316.    -------
  2317.    bool
  2318.        True if replaced, False if skipped (specified file is the same
  2319.        as the current script).
  2320.  
  2321.    """
  2322.     if system_is_windows():
  2323.         raise NotImplementedError('Self upgrade not supported on Windows.')
  2324.  
  2325.     import filecmp
  2326.     import shutil
  2327.  
  2328.     path = os.path.realpath(path)
  2329.     self_path = os.path.realpath(__file__)
  2330.  
  2331.     if filecmp.cmp(path, self_path):
  2332.         return False
  2333.  
  2334.     self_stat = os.stat(self_path)
  2335.     os.chown(path, self_stat.st_uid, self_stat.st_gid)
  2336.     os.chmod(path, self_stat.st_mode)
  2337.  
  2338.     shutil.move(path, self_path)
  2339.     return True
  2340.  
  2341.  
  2342. def self_upgrade(include_git=False):
  2343.     """Perform in-place self-upgrade.
  2344.  
  2345.    Parameters
  2346.    ----------
  2347.    include_git : bool, optional
  2348.        See `download_latest_googler`. Default is False.
  2349.  
  2350.    """
  2351.     git_ref, path = download_latest_googler(include_git=include_git)
  2352.     if self_replace(path):
  2353.         printerr('Upgraded to %s.' % git_ref)
  2354.     else:
  2355.         printerr('Already up to date.')
  2356.  
  2357.  
  2358. # Miscellaneous functions
  2359.  
  2360. def python_version():
  2361.     return '%d.%d.%d' % sys.version_info[:3]
  2362.  
  2363.  
  2364. def https_proxy_from_environment():
  2365.     return os.getenv('https_proxy')
  2366.  
  2367.  
  2368. def parse_proxy_spec(proxyspec):
  2369.     if '://' in proxyspec:
  2370.         pos = proxyspec.find('://')
  2371.         scheme = proxyspec[:pos]
  2372.         proxyspec = proxyspec[pos+3:]
  2373.         if scheme.lower() != 'http':
  2374.             # Only support HTTP proxies.
  2375.             #
  2376.             # In particular, we don't support HTTPS proxies since we
  2377.             # only speak plain HTTP to the proxy server, so don't give
  2378.             # users a false sense of security.
  2379.             raise NotImplementedError('Unsupported proxy scheme %s.' % scheme)
  2380.  
  2381.     if '@' in proxyspec:
  2382.         pos = proxyspec.find('@')
  2383.         user_passwd = urllib.parse.unquote(proxyspec[:pos])
  2384.         # Remove trailing '/' if any
  2385.         host_port = proxyspec[pos+1:].rstrip('/')
  2386.     else:
  2387.         user_passwd = None
  2388.         host_port = proxyspec.rstrip('/')
  2389.  
  2390.     if ':' not in host_port:
  2391.         # Use port 1080 as default, following curl.
  2392.         host_port += ':1080'
  2393.  
  2394.     return user_passwd, host_port
  2395.  
  2396.  
  2397. # Query autocompleter
  2398.  
  2399. # This function is largely experimental and could raise any exception;
  2400. # you should be prepared to catch anything. When it works though, it
  2401. # returns a list of strings the prefix could autocomplete to (however,
  2402. # it is not guaranteed that they start with the specified prefix; for
  2403. # instance, they won't if the specified prefix ends in a punctuation
  2404. # mark.)
  2405. def completer_fetch_completions(prefix):
  2406.     import html
  2407.     import json
  2408.     import re
  2409.     import urllib.request
  2410.  
  2411.     # One can pass the 'hl' query param to specify the language. We
  2412.     # ignore that for now.
  2413.     api_url = ('https://www.google.com/complete/search?client=psy-ab&q=%s' %
  2414.                urllib.parse.quote(prefix, safe=''))
  2415.     # A timeout of 3 seconds seems to be overly generous already.
  2416.     resp = urllib.request.urlopen(api_url, timeout=3)
  2417.     charset = resp.headers.get_content_charset()
  2418.     logger.debug('Completions charset: %s', charset)
  2419.     respobj = json.loads(resp.read().decode(charset))
  2420.  
  2421.     # The response object, once parsed as JSON, should look like
  2422.     #
  2423.     # ['git',
  2424.     #  [['git<b>hub</b>', 0],
  2425.     #   ['git', 0],
  2426.     #   ['git<b>lab</b>', 0],
  2427.     #   ['git<b> stash</b>', 0]],
  2428.     #  {'q': 'oooAhRzoChqNmMbNaaDKXk1YY4k', 't': {'bpc': False, 'tlw': False}}]
  2429.     #
  2430.     # Note the each result entry need not have two members; e.g., for
  2431.     # 'gi', there is an entry ['gi<b>f</b>', 0, [131]].
  2432.     HTML_TAG = re.compile(r'<[^>]+>')
  2433.     return [html.unescape(HTML_TAG.sub('', entry[0])) for entry in respobj[1]]
  2434.  
  2435.  
  2436. def completer_run(prefix):
  2437.     if prefix:
  2438.         completions = completer_fetch_completions(prefix)
  2439.         if completions:
  2440.             print('\n'.join(completions))
  2441.     sys.exit(0)
  2442.  
  2443.  
def parse_args(args=None, namespace=None):
    """Parse googler arguments/options.

    Parameters
    ----------
    args : list, optional
        Arguments to parse. Default is ``sys.argv``.
    namespace : argparse.Namespace
        Namespace to write to. Default is a new namespace.

    Returns
    -------
    argparse.Namespace
        Namespace with parsed arguments / options.

    """

    # GOOGLER_COLORS, when set, overrides the default color string.
    colorstr_env = os.getenv('GOOGLER_COLORS')

    argparser = GooglerArgumentParser(description='Google from the command-line.')
    addarg = argparser.add_argument
    # Result paging and search scope options
    addarg('-s', '--start', type=argparser.nonnegative_int, default=0,
           metavar='N', help='start at the Nth result')
    addarg('-n', '--count', dest='num', type=argparser.positive_int,
           default=10, metavar='N', help='show N results (default 10)')
    addarg('-N', '--news', action='store_true',
           help='show results from news section')
    addarg('-c', '--tld', metavar='TLD',
           help="""country-specific search with top-level domain .TLD, e.g., 'in'
           for India""")
    addarg('-l', '--lang', metavar='LANG', help='display in language LANG')
    addarg('-x', '--exact', action='store_true',
           help='disable automatic spelling correction')
    # Output and presentation options
    addarg('-C', '--nocolor', dest='colorize', action='store_false',
           help='disable color output')
    addarg('--colors', dest='colorstr', type=argparser.is_colorstr,
           default=colorstr_env if colorstr_env else 'GKlgxy', metavar='COLORS',
           help='set output colors (see man page for details)')
    addarg('-j', '--first', '--lucky', dest='lucky', action='store_true',
           help='open the first result in web browser and exit')
    addarg('-t', '--time', dest='duration', type=argparser.is_duration,
           metavar='dN', help='time limit search '
           '[h5 (5 hrs), d5 (5 days), w5 (5 weeks), m5 (5 months), y5 (5 years)]')
    addarg('-w', '--site', dest='sites', action='append', metavar='SITE',
           help='search a site using Google')
    addarg('--unfilter', action='store_true', help='do not omit similar results')
    # Network / environment options
    addarg('-p', '--proxy', default=https_proxy_from_environment(),
           help="""tunnel traffic through an HTTP proxy;
           PROXY is of the form [http://][user:password@]proxyhost[:port]""")
    addarg('--noua', action='store_true', help='disable user agent')
    addarg('--notweak', action='store_true',
           help='disable TCP optimizations and forced TLS 1.2')
    addarg('--json', action='store_true',
           help='output in JSON format; implies --noprompt')
    addarg('--url-handler', metavar='UTIL',
           help='custom script or cli utility to open results')
    addarg('--show-browser-logs', action='store_true',
           help='do not suppress browser output (stdout and stderr)')
    addarg('--np', '--noprompt', dest='noninteractive', action='store_true',
           help='search and exit, do not prompt')
    addarg('keywords', nargs='*', metavar='KEYWORD', help='search keywords')
    # Self-upgrade flags are only offered where upgrade is supported.
    if ENABLE_SELF_UPGRADE_MECHANISM and not system_is_windows():
        addarg('-u', '--upgrade', action='store_true',
               help='perform in-place self-upgrade')
        addarg('--include-git', action='store_true',
               help='when used with --upgrade, upgrade to latest git master')
    addarg('-v', '--version', action='version', version=_VERSION_)
    addarg('-d', '--debug', action='store_true', help='enable debugging')
    # Hidden option used by the shell completion machinery.
    addarg('--complete', help=argparse.SUPPRESS)

    return argparser.parse_args(args, namespace)
  2515.  
  2516.  
def main():
    """Program entry point: parse options, configure globals, and run a
    search either non-interactively or via the interactive prompt."""
    # `ua` is a module-level flag read elsewhere; --noua clears it here.
    global ua

    try:
        opts = parse_args()

        # Set logging level
        if opts.debug:
            logger.setLevel(logging.DEBUG)
            logger.debug('googler version %s', _VERSION_)
            logger.debug('Python version %s', python_version())

        # Handle query completer
        if opts.complete is not None:
            completer_run(opts.complete)

        # Handle self-upgrade
        # (opts only has 'upgrade' when the flag was registered; see parse_args)
        if hasattr(opts, 'upgrade') and opts.upgrade:
            self_upgrade(include_git=opts.include_git)
            sys.exit(0)

        # Abort early if stdout cannot represent the output encoding.
        check_stdout_encoding()

        if opts.keywords:
            try:
                # Add cmdline args to readline history
                readline.add_history(' '.join(opts.keywords))
            except Exception:
                # readline may be missing (import guarded at top of file);
                # history is best-effort only.
                pass

        # Set colors
        if opts.colorize:
            colors = Colors(*[COLORMAP[c] for c in opts.colorstr], reset=COLORMAP['x'])
        else:
            colors = None
        Result.colors = colors
        Result.urlexpand = True if os.getenv('DISABLE_URL_EXPANSION') is None else False
        GooglerCmd.colors = colors

        if opts.url_handler is not None:
            open_url.url_handler = opts.url_handler
        else:
            # Set text browser override to False
            open_url.override_text_browser = False

            # Handle browser output suppression
            if opts.show_browser_logs or (os.getenv('BROWSER') in text_browsers):
                open_url.suppress_browser_output = False
            else:
                open_url.suppress_browser_output = True

        if opts.noua:
            logger.debug('User Agent is disabled')
            ua = False

        # The REPL object drives both interactive and one-shot modes.
        repl = GooglerCmd(opts)

        if opts.json or opts.lucky or opts.noninteractive:
            # Non-interactive mode
            repl.fetch()
            if opts.lucky:
                if repl.results:
                    open_url(repl.results[0].url)
                else:
                    print('No results.', file=sys.stderr)
            else:
                repl.display_results(prelude='', json_output=opts.json)
            sys.exit(0)
        else:
            # Interactive mode
            repl.cmdloop()
    except Exception as e:
        # With debugging on, let the exception through for a traceback;
        # otherwise, only print the exception error message.
        if logger.isEnabledFor(logging.DEBUG):
            raise
        else:
            logger.error(e)
            sys.exit(1)
  2596.  
# Invoke the CLI entry point only when executed as a script (not on import).
if __name__ == '__main__':
    main()
RAW Paste Data