Share Pastebin
Guest
Public paste!

Karthik Chikmagalur

By: a guest | May 20th, 2009 | Syntax: Python | Size: 7.36 KB | Hits: 298 | Expires: Never
Copy text to clipboard
  1. #!/usr/bin/python
  2.  
  3. """Python script to query Google for definitions of words from the command line
  4.  
  5. Usage:
  6. define [-hlrfu] phrase1 [phrase2 phrase3 ...]
  7. Options:
  8.                  -h : Quick help
  9.         -l : Include URLs to sources of definitions
  10.                  -r : Pipe raw HTML to terminal, useful for redirection to browsers
  11.                  -f : (Slightly) Fancy output
  12.                  -u : Display results in Unicode UTF-8
  13. """
  14.  
  15. __author__ = "Karthik Chikmagalur (karthik.chikmagalur@gmail.com)"
  16. __version__ = "$Revision: 0.1 $"
  17. __copyright__ = "(C) Karthik Chikmagalur"
  18. __license_ = "GNU GPL 3"
  19.  
  20. ##############################################################################
  21. # To Do:
  22. # Improve Unicode support
  23. # Port to Windows: minor changes needed
  24. ##############################################################################
  25.  
  26. from httplib import HTTPConnection
  27. from sgmllib import SGMLParser
  28. from optparse import OptionParser
  29. import sys, re, urllib, htmlentitydefs
  30.  
  31. ##############################################################################
  32. #Flags
  33. flag_word={}
  34.  
  35. # Parse command line options:
  36.  
  37. cli_parser = OptionParser( usage="usage: %prog [-rlf] phrase1 [phrase2 phrase3 ...]\n%prog --help for help" )
  38. cli_parser.add_option( "-l", "--links", action="store_true", dest="url",
  39.                                           default=False, help="Include URLs to sources of definitions" )
  40. cli_parser.add_option( "-r", "--raw", action="store_true", dest="raw",
  41.                                           default=False, help="Spout raw HTML, useful for redirection to browsers" )
  42. cli_parser.add_option( "-f", "-F", action="store_true", dest="fancy",
  43.                                           default=False, help="Slightly fancier output, makes it easier to read" )
  44. cli_parser.add_option( "-u", "--unicode", action="store_true", dest="unicode",
  45.                                           default=False, help="Display results in unicode UTF-8" )
  46.  
  47. ( flag, words ) = cli_parser.parse_args(  )
  48. if not words:
  49.         words = sys.stdin.read( ).strip( ).split( " " )
  50. if not words:
  51.         print "usage: define [-rlf] phrase1 [phrase2 phrase3 ...]\ndefine --help for help"
  52.         sys.exit(  )
  53. ##############################################################################
  54. # HTTP Constants
  55.  
  56. GoogleServer = 'www.google.com'
  57. RequestBase  = '/search?q='
  58. DefineKeyword = 'define: '
  59. # Pretending to be Lynx, simplifies HTML returned by Google (?)
  60. Headers   = { 'User-Agent' : 'Lynx/2.8.6rel.4 libwww-FM/2.14' }
  61.  
  62. #Example: http://www.google.co.in/search?hl=en&q=define%3A+pine+tree
  63. ##############################################################################
  64. # Data Retrieval from Google
  65.  
  66. if not words:
  67.         sys.exit( "Usage: define [-lrf] phrase1 [phrase2 phrase3 ...]\ndefine --help for help" )
  68.  
  69. response = {}
  70.  
  71. try:
  72.         connection = HTTPConnection( GoogleServer )
  73.         for word in words:
  74.                 requeststring = RequestBase + urllib.quote_plus( DefineKeyword + word )
  75.                 connection.request ( method = 'GET', url = requeststring, headers = Headers )
  76.                 response[ word ] = connection.getresponse( ).read( ).strip( )
  77. except Exception, exception:
  78.         sys.exit( 'Unable to make request "http://%s/%s" - %s' % ( GoogleServer, requeststring, exception ) )
  79.  
  80. ##############################################################################
  81. # Regular Expressions
  82.  
  83. # This regular expression discards the HTML bits not required- which is most of what is returned.
  84. CoarseMatchEx = re.compile ( r'.*on the Web:</p>(?P<def_section>.*?)'
  85.                                                          r'(<font size=-1><p>|<div style="text-align:center">)',
  86.                                                          re.DOTALL )
  87.  
  88. # Matches <a href="some_url">hypertext</a>, that is, any URL bit.
  89. URLMatchEx = re.compile ( r'<a href=.*?</a>', re.MULTILINE )
  90.  
  91. ##############################################################################
  92. # Data processing with Regular expressions defined above
  93.  
  94. value = {}
  95. for word in words:
  96.         flag_word[ word ] = 1
  97.         coarsematch = CoarseMatchEx.match( response[ word ] )
  98.         if coarsematch:
  99.                 value[ word ] = coarsematch.group( 'def_section' )
  100.         else:
  101.                 # No match => No definitions found.
  102.                 flag_word[ word ] = 0
  103.  
  104. if not flag.url:
  105.         for word in words:
  106.                 # Discard URLs given that definitions are found.
  107.                 if flag_word[ word ]: value[ word ] = URLMatchEx.sub( '', value[ word ], 0 )
  108.                
  109. if flag.raw:
  110.         #Raw output needed, no more processing necessary.
  111.         for word in words:
  112.                 if flag_word[ word ]:
  113.                         print "<b>%s</b>: " % word.upper(  )
  114.                         print value[ word ]
  115.                 else:
  116.                         print "No definitions found for <b>%s</b><br><br>" % word.upper(  )
  117.         sys.exit(  )
  118.  
  119. # Done
  120. ##############################################################################
  121. # HTML processing and conversion to text
  122.  
  123. # Class methods for HTML to Text conversion. Most tags will be discarded,
  124. # some replaced by newlines and asterisks, etc.
  125.  
  126. entitysubs = { 'quot': '"', 'lt' : '<', 'gt' : '>', 'amp' : '&', 'apos' : '\'',
  127.                           'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', 'copy':'(C)',
  128.                           'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
  129.                           'ndash':'-' }
  130. if not flag.unicode:
  131.         charsubs = { 'oelig':'oe', 'aelig':'ae', 'agrave':'a', 'aacute':'a', 'uuml':'u',
  132.                                  'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', 'egrave':'e',
  133.                                  'eacute':'e', 'ecirc':'e', 'euml':'e', 'igrave':'i', 'iacute':'i',
  134.                                  'icirc':'i', 'iuml':'i', 'ograve':'o', 'oacute':'o', 'ocirc':'o',
  135.                                  'otilde':'o', 'ouml':'o', 'ugrave':'u', 'uacute':'u', 'ucirc':'u' }
  136.  
  137.  
  138. class DefinePageHTML2Text( SGMLParser ):
  139.         def reset( self ):
  140.                 self.ignore_URL = 0
  141.                 self.prefix = ''
  142.                 self.suffix = ''
  143.                 self.pieces = [  ]
  144.                 SGMLParser.reset( self )
  145.                
  146.         def start_br( self, attrs ):
  147.                 self.unknown_starttag( "br", attrs )
  148.                 self.pieces.append( "\n" )
  149.                 self.prefix= ""
  150.                 self.suffix= "\n"
  151.  
  152.         def start_li( self, attrs ):
  153.                 self.unknown_starttag( "li", attrs )
  154.                 self.pieces.append( "\n" )
  155.                 if flag.fancy: self.prefix = "* "
  156.                 self.suffix= ""
  157.  
  158.         def start_a( self, attrs ):
  159.                 self.unknown_starttag( "a", attrs )
  160.                 self.ignore_URL = 1
  161.                 if attrs[ 0 ][ 0 ]== 'href':
  162.                         self.pieces.append( re.match( r'.*&q=(.*?)&usg.*$', attrs[ 0 ][ 1 ] ).group( 1 )+"\n" )
  163.  
  164.         def handle_charref( self, ref ):
  165.                 # Called for each character reference, e.g. for "&#160;", ref will be "160"
  166.                 # Convert ref to unicode string and append to output text
  167.                 if flag.unicode:
  168.                         self.pieces.append( unichr( int( ref ) ).encode( 'utf-8' ) )
  169.                 else:
  170.                         if ref in htmlentitydefs.codepoint2name.keys( ):
  171.                                 self.pieces.append( charsubs[ htmlentitydefs.codepoint2name [ref].lower( ) ] )
  172.                
  173.         def handle_entityref( self, ref ):
  174.                 # called for each entity reference, e.g. for "&copy;", ref will be "copy"
  175.                 # Replace with closest ASCII equivalent if possible
  176.                 if ref in entitysubs.keys( ):
  177.                         self.pieces.append( entitysubs[ ref ] )
  178.                 else:
  179.                         self.pieces.append("&%(ref)s" % locals( ) )
  180.                         if htmlentitydefs.entitydefs.has_key( ref ):
  181.                                 self.pieces.append( ";" )
  182.  
  183.         def handle_data( self, text ):
  184.                 if not self.ignore_URL: self.pieces.extend( [ self.prefix, text, self.suffix ] )
  185.                 self.ignore_URL = 0
  186.                
  187.         def output( self ):
  188.                 """Return processed HTML as a single string"""
  189.                 return "".join( self.pieces )
  190.  
  191. parser = DefinePageHTML2Text(  )
  192.  
  193. #Almost done- Time to feed the parser...
  194. for word in words:
  195.         print str.upper( word )+": ",
  196.         if flag_word[ word ]:
  197.                 parser.feed( value[ word ] )
  198.                 #...and spit the text out.
  199.                 print parser.output(  )
  200.         else:
  201.                 #Ouch.
  202.                 print "No definitions found for %s\n" % word.upper(  )
  203.         parser.reset(  )
  204.  
  205. #And that's that.
  206. ##############################################################################