# HTMLParserEx

# -*- coding: utf-8 -*-

'''
    HTMLParserEx Script
    Copyright (C) 2015 Amin Paks

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''

import sys, re, urllib, urllib2, httplib, urlparse, xml, time
from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree

class httpConn(object):
  '''
  Small HTTP/HTTPS client on top of httplib that remembers the last URL.

  The parsed URL is kept in ``self.url`` using the urlparse slot order
  (0 scheme, 1 netloc, 2 path, 3 params, 4 query, 5 fragment).  Slot 3 is
  overwritten by request() with the url-encoded ``params`` and is what gets
  sent as the request body.  When ``proxy`` is true the constructor URL is
  stored in ``self.proxy`` (scheme, netloc) and the connection is opened to
  that address instead of the target host.

  The object can be used as a context manager; the connection is closed on
  exit and on garbage collection.
  '''

  def __init__(self, url='', proxy=False):
    # Slots 6-9 are never written by this class; the length is kept as-is
    # for any external code that indexes into the list.
    self.url = ['', '', '', '', '', '', '', '', '', '']
    self.proxy = ['', '']
    self.isProxy = proxy
    self.parse( url, connect=False, proxy=proxy )

  def __del__(self):
    self.close()

  def __enter__(self):
    return self

  def __exit__(self, type, value, traceback):
    self.close()

  def parse(self, url='', connect=True, proxy=False):
    '''
    Merge ``url`` into the stored URL (or into the proxy address).

    Scheme and netloc are only replaced when ``url`` provides them, so a
    relative URL keeps the current host.  Path, params, query and fragment
    are always replaced; an empty path becomes '/'.

    url     -- absolute or relative URL
    connect -- when true, connect() is called after the merge
    proxy   -- when true, ``self.proxy`` is updated instead of ``self.url``
    '''
    urlobj = urlparse.urlparse(url)

    # ``cururl`` aliases the stored list, so it is updated in place.
    cururl = self.proxy if proxy else self.url

    # A cached connection is bound to one scheme + host pair.  Drop it as
    # soon as either changes (a proxy connection is kept regardless).
    hostChanged = bool( urlobj[1] ) and urlobj[1] != cururl[1]
    schemeChanged = bool( urlobj[0] ) and urlobj[0] != cururl[0]

    if ( hostChanged or schemeChanged ) and not self.isProxy:
      self.close()

    for idx, x in enumerate(urlobj):
      if idx <= 1:
        # scheme / netloc: keep the previous value for relative URLs
        if x:
          cururl[ idx ] = x
      elif len( cururl ) > idx:
        # the two-slot proxy list has no room for path and beyond
        cururl[ idx ] = x

    if len( cururl ) > 2 and not cururl[2]:
      cururl[2] = '/'

    if connect:
      self.connect()

  def connect(self, reconnect=False):
    '''
    Create the httplib connection object for the current host (or proxy).

    An existing connection is reused unless ``reconnect`` is true, in which
    case it is closed first and replaced.

    Returns True.  Raises Exception when the scheme is not http/https or no
    host is known.
    '''
    if 'conn' in self.__dict__:
      if not reconnect:
        return True

      # Close the old connection before replacing it, otherwise it leaks.
      self.close()

    url = self.proxy if self.isProxy else self.url

    if url[0] not in ('http', 'https') or not url[1]:
      raise Exception('No protocol or host indicated!!')

    if url[0] == 'http':
      self.conn = httplib.HTTPConnection( url[1] )
    else:
      self.conn = httplib.HTTPSConnection( url[1] )

    return True

  def request(self, url='', params=None, headers=None, reqType='GET'):
    '''
    Send a request and return the httplib response object.

    url     -- absolute or relative URL, merged through parse()
    params  -- mapping or sequence of pairs; url-encoded and sent as the
               request body (stored in ``self.url[3]``)
    headers -- mapping of extra request headers
    reqType -- HTTP method name

    The response is also stored in ``self.response``.
    '''
    if headers is None:
      headers = {}

    self.parse( url )

    try:
      if params is not None and len( params ):
        self.url[3] = urllib.urlencode( params )
    except Exception:
      # Best effort: parameters that cannot be encoded are ignored.
      pass

    reqUrl = self.url[2]

    if self.isProxy:
      # A proxy needs the absolute URL on the request line.
      reqUrl = self.url[0] + '://' + self.url[1] + self.url[2]

    if self.url[4]:
      reqUrl += '?' + self.url[4]

    self.conn.request( reqType, reqUrl, self.url[3], headers )
    self.response = self.conn.getresponse()
    return self.response

  def close(self):
    '''Close and forget the current connection, if there is one.'''
    if 'conn' in self.__dict__:
      self.conn.close()
      del self.conn

class HTMLParserEx(HTMLParser):
  '''
  Lenient HTML parser that builds an ElementTree.

  feed() first normalises the markup with a few regular expressions
  (script bodies replaced by placeholders, self-closing img/br/hr, closing
  tags stripped of attributes, attribute values re-quoted) and then hands
  it to HTMLParser.  The handler callbacks forward events to an etree
  TreeBuilder; close() returns the resulting root element.
  '''

  def __init__(self):
    HTMLParser.__init__(self)
    self.tb = etree.TreeBuilder()

    # Script tags captured by the last feed() and the index of the next
    # placeholder to hand out.
    self.scripts = []
    self.scriptIndex = 0

    # Strips everything but digits (used to read a placeholder index).
    self.reInt = re.compile(r'\D')
    self.reScriptTags = re.compile(r'(<script\b[^>]*>)(.*?)(</script>)', re.DOTALL)
    self.reSelfClosingTags = re.compile(r'((<(img|br|hr)\b.*?)/?>)', re.DOTALL)

    # Closing tags carrying junk after the name, e.g. </div foo>.  The junk
    # must stop at the first '>' - a greedy '.*' would swallow the rest of
    # the document up to its last '>'.
    self.reInvalidClosingTags = re.compile(r'</(\w+)\s+[^>]*>')

    self.reAllOpeningTags = re.compile(r'(<[A-Z][A-Z0-9]*)\b(.*?)(/?>)', re.DOTALL | re.IGNORECASE)

    # NOTE(review): only name=value attributes are recognised; an unquoted
    # value runs up to the next quote character or the end of the tag.
    self.reTagAttrs = re.compile(r'([_a-z]+[_a-z0-9-]*)(=)["\']{0,1}([^"\']*)', re.DOTALL| re.IGNORECASE)

  def handle_starttag(self, tag, attributes):
    '''Forward an opening tag to the tree builder (errors are ignored).'''
    try:
      self.tb.start(tag, dict(attributes))
    except Exception:
      # Best effort on broken markup.
      pass

  def handle_endtag(self, tag):
    '''Forward a closing tag to the tree builder (errors are ignored).'''
    try:
      self.tb.end(tag)
    except Exception:
      # Best effort on broken markup, e.g. unbalanced closing tags.
      pass

  def handle_data(self, data):
    '''
    Forward text to the tree builder.

    Text starting with a '[REPLACE_SCRIPT:' placeholder is swapped for the
    script body captured by feed(); when the index cannot be resolved the
    text is passed through unchanged.
    '''
    if data.startswith('[REPLACE_SCRIPT:'):
      try:
        data = self.scripts[ int( self.reInt.sub( '', data )) ][1]
      except (ValueError, IndexError):
        pass

    self.tb.data( data )

  def close(self):
    '''Finish parsing; return the root element, or False on failure.'''
    try:
      HTMLParser.close(self)
      return self.tb.close()
    except Exception:
      return False

  def feed(self, text, encoding='utf-8'):
    '''
    Normalise ``text`` and parse it.

    text     -- HTML source
    encoding -- encoding applied to the captured script tags

    Returns the normalised markup that was handed to HTMLParser.
    '''
    # NOTE(review): script tags are captured from the *encoded* text, so
    # their bodies are stored as encoded strings - confirm this is intended.
    self.scripts = self.reScriptTags.findall( text.encode( encoding, 'ignore' ))
    self.scriptIndex = 0

    data = self.reScriptTags.sub( self.getScriptReplacement, text )
    data = self.reSelfClosingTags.sub(r'\2/>', data)
    data = self.reInvalidClosingTags.sub( r'</\1>', data )
    data = self.reAllOpeningTags.sub( self.getCleanOpeningTag, data )

    HTMLParser.feed( self, data )

    return data

  def getScriptReplacement(self, match):
    '''
    Return the script tag of ``match`` with its body replaced by a
    placeholder.  Placeholders are numbered from 0 so that they index
    directly into ``self.scripts``.
    '''
    idx = self.scriptIndex
    self.scriptIndex += 1
    return match.group(1) + '[REPLACE_SCRIPT:' + str( idx ) + ']' + match.group(3)

  def getCleanOpeningTag(self, match):
    '''Rebuild an opening tag with every attribute value double-quoted.'''
    result = u'' + match.group(1)
    for attr in self.reTagAttrs.findall( match.group(2) ):
      result += u' ' + attr[0] + attr[1] + u'"' + attr[2] + u'"'
    result += match.group(3)
    return result

class cssSelector(object):
  '''
  Minimal CSS-like selector engine for ElementTree elements.

  Supported syntax: tag names (and '*'), '#id', '.class', and attribute
  tests '[name]', '[name=v]', '[name^=v]', '[name|=v]', '[name*=v]',
  '[name~=v]'.  All comparisons are case-insensitive.

  NOTE(review): space-separated parts are not treated as a descendant
  combinator; their conditions are merged, the last part winning per
  condition type.  Several classes ('.a.b') match when ANY of them is
  present.  Both behaviours are kept unchanged here.
  '''

  def __init__(self, element, selector='*'):
    '''
    element  -- root element to search; anything without a usable set()
                method is replaced by an empty <html> element
    selector -- default selector used by find()
    '''
    try:
      # NOTE(review): this stores the non-string value True in the '_doc'
      # attribute of the caller's element.
      element.set('_doc', True)
      self.root = element
    except Exception:
      self.root = etree.Element('html')

    self.selector = selector
    self.selected = []
    # groups: attribute name, optional operator, optional value
    self.reAttrs = re.compile(r'^(-?[_a-zA-Z]+[_a-zA-Z0-9-]*)(\*?\|?\^?~?=)?(.+)?$')

  def parseSelector(self, selector):
    '''
    Split a selector string into a list of dicts, one per space-separated
    part.  Possible keys: 'tag', 'id', 'class' (comma-joined names),
    'attr' ('|||'-joined bracket contents) and 'pseudo'.
    '''
    result = []
    select = {}

    idx = 'tag'

    for letter in selector:
      if idx == 'attr' and letter not in ('[', ']'):
        # Inside [...] every character belongs to the attribute test.
        pass

      elif letter == '#':
        idx = 'id'
        continue

      elif letter == '.':
        idx = 'class'
        if idx in select:
          select[ idx ] += ','
        continue

      elif letter == '[':
        idx = 'attr'
        if idx in select:
          select[ idx ] += '|||'
        continue

      elif letter == ']':
        idx = 'tag'
        continue

      elif letter == ':':
        idx = 'pseudo'
        continue

      elif letter == ' ':
        idx = 'tag'
        result.append( select )
        select = {}
        continue

      select[ idx ] = select.get( idx, '' ) + letter

    if select:
      result.append( select )

    return result

  def _matchAttr(self, element, attr):
    '''Evaluate one bracket expression (without brackets) on ``element``.'''
    parsed = self.reAttrs.match( attr )
    if not parsed:
      # Unparseable tests are ignored.
      return True

    name, operator, value = parsed.groups()
    actual = element.get( name.lower(), '' )

    if not actual:
      return False

    if operator is None:
      # Plain presence test: [name]
      return True

    if not hasattr( actual, 'lower' ):
      # Non-string attribute value (e.g. the '_doc' marker) cannot be
      # compared with a selector value.
      return False

    value = value or ''
    if len( value ) >= 2 and value[0] == value[-1] and value[0] in '"\'':
      # [href="x"] and [href=x] are the same test.
      value = value[1:-1]

    actual = actual.lower()
    value = value.lower()

    if operator == '=':
      return actual == value
    if operator == '*=':
      return value in actual
    if operator in ('^=', '|='):
      return actual.startswith( value )
    if operator == '~=':
      return value in actual.split()

    # Any other operator combination accepted by the regex is ignored.
    return True

  def domSelector(self, element, select):
    '''
    Return ``element`` and all of its descendants (document order) that
    satisfy ``select``, a list as produced by parseSelector().  Objects
    that are not elements yield an empty list.
    '''
    result = []

    if not hasattr( element, 'tag' ):
      return result

    try:
      # list(element) replaces Element.getchildren(), which no longer
      # exists on current Python versions.
      children = list( element )
    except TypeError:
      return result

    tagCondition = True
    idCondition = True
    classCondition = True
    attrCondition = True

    tagName = element.tag.lower() if hasattr( element.tag, 'lower' ) else ''

    for item in select:
      if 'tag' in item:
        wanted = item[ 'tag' ].lower()
        # '*' is the universal selector (and the constructor default).
        tagCondition = wanted == '*' or tagName == wanted

      if 'id' in item:
        idCondition = element.get('id', '').lower() == item[ 'id' ].lower()

      if 'class' in item:
        classes = item[ 'class' ].lower().split(',')
        tagClasses = element.get('class', '').lower().split(' ')
        classCondition = any( c in classes for c in tagClasses )

      if 'attr' in item:
        # Every bracket expression has to hold.
        attrCondition = all(
          self._matchAttr( element, attr )
          for attr in item[ 'attr' ].split('|||')
        )

    if tagCondition and idCondition and classCondition and attrCondition:
      result.append( element )

    for child in children:
      result += self.domSelector( child, select )

    return result

  def find(self, selector=None, elements=None):
    '''
    Run ``selector`` (default: the constructor selector) and remember the
    matches in ``self.selected``.

    elements -- element or list/tuple of elements to search in; when
                omitted the previous matches are searched, or the root
                element if there are none

    Returns True when at least one element matched.  ``self.selected`` is
    left untouched when nothing matched.
    '''
    if selector is None:
      selector = self.selector

    select = self.parseSelector( selector )

    if hasattr( elements, 'tag' ):
      # Checked first: an Element without children is falsy.
      elements = [elements]
    elif not elements:
      elements = self.selected if self.selected else [self.root]
    elif not isinstance( elements, (list, tuple) ):
      elements = [elements]

    result = []
    seen = set()

    for el in elements:
      for match in self.domSelector( el, select ):
        # Overlapping search roots would report the same element twice.
        if id( match ) not in seen:
          seen.add( id( match ))
          result.append( match )

    if result:
      self.selected = result

    return bool( result )

  @classmethod
  def toString(cls, element, encoding='utf-8'):
    '''Serialise ``element``; return False when it cannot be serialised.'''
    try:
      return etree.tostring( element, encoding )
    except Exception:
      return False