Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- '''
- HTMLParserEx Script
- Copyright (C) 2015 Amin Paks
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- '''
- import sys, re, urllib, urllib2, httplib, urlparse, xml, time
- from HTMLParser import HTMLParser
- from xml.etree import cElementTree as etree
class httpConn(object):
    '''
    Minimal HTTP/HTTPS client that keeps one ``httplib`` connection open.

    ``self.url`` holds the components of the last parsed target URL in
    ``urlparse`` order (scheme, host, path, params, query, fragment),
    padded to ten slots.  When the instance is created with ``proxy=True``
    the URL given to the constructor is stored in ``self.proxy`` (scheme
    and host only) and every connection is opened to that proxy instead
    of to the target host.

    The object can be used as a context manager; the connection is closed
    on exit and when the object is garbage collected.
    '''

    def __init__(self, url='', proxy=False):
        '''
        Store ``url`` without connecting.

        url   -- target URL, or the proxy URL when ``proxy`` is true.
        proxy -- when true, ``url`` names a proxy to send requests through.
        '''
        self.url = [''] * 10
        self.proxy = ['', '']
        self.isProxy = proxy
        self.parse(url, connect=False, proxy=proxy)

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def parse(self, url='', connect=True, proxy=False):
        '''
        Merge the components of ``url`` into the stored URL.

        Scheme and host are only overwritten by non-empty values, so a
        relative URL keeps the previous host.  Every other component is
        always overwritten, and an empty path becomes '/'.

        url     -- absolute or relative URL.
        connect -- when true, call ``connect()`` afterwards.
        proxy   -- when true, update ``self.proxy`` instead of ``self.url``.
        '''
        urlobj = urlparse.urlparse(url)
        cururl = self.proxy if proxy else self.url

        # A connection is bound to one scheme and one host.  Drop it when
        # either changes, unless we talk to a proxy, which stays the same
        # whatever the target is.
        schemeChanged = bool(urlobj[0]) and urlobj[0] != cururl[0]
        hostChanged = bool(urlobj[1]) and urlobj[1] != cururl[1]
        if (schemeChanged or hostChanged) and not self.isProxy:
            self.close()

        for idx, part in enumerate(urlobj):
            if idx <= 1:
                if part:
                    cururl[idx] = part
            elif idx < len(cururl):
                cururl[idx] = part

        if len(cururl) > 2 and not cururl[2]:
            cururl[2] = '/'

        if connect:
            self.connect()

    def connect(self, reconnect=False):
        '''
        Create the ``httplib`` connection object if there is none yet.

        reconnect -- when true, close any existing connection and create
                     a fresh one.

        Returns True.  Raises ``Exception`` when the scheme is not http or
        https, or when no host is known.
        '''
        if 'conn' in self.__dict__:
            if not reconnect:
                return True
            # Close the old connection first so its socket is not leaked.
            self.close()

        target = self.proxy if self.isProxy else self.url
        scheme, host = target[0], target[1]
        if scheme not in ('http', 'https') or not host:
            raise Exception('No protocol or host indicated!!')

        if scheme == 'http':
            self.conn = httplib.HTTPConnection(host)
        else:
            self.conn = httplib.HTTPSConnection(host)
        return True

    def request(self, url='', params=None, headers=None, reqType='GET'):
        '''
        Send a request and return the ``httplib`` response object.

        url     -- absolute or relative URL; merged via ``parse()``.
        params  -- mapping or sequence of pairs; url-encoded and sent as
                   the request body.  Values that cannot be encoded are
                   ignored and the request is sent without them.
        headers -- mapping of extra request headers.
        reqType -- HTTP method name.

        The response is also kept in ``self.response``.
        '''
        self.parse(url)

        if params:
            try:
                self.url[3] = urllib.urlencode(params)
            except (TypeError, ValueError):
                # Best effort, as before: send the request without a body.
                pass

        reqUrl = self.url[2]
        if self.isProxy:
            # A proxy needs the absolute URL of the target.
            reqUrl = self.url[0] + '://' + self.url[1] + self.url[2]
        if self.url[4]:
            reqUrl += '?' + self.url[4]

        self.conn.request(reqType, reqUrl, self.url[3], headers or {})
        self.response = self.conn.getresponse()
        return self.response

    def close(self):
        '''Close and forget the current connection, if any.'''
        conn = self.__dict__.pop('conn', None)
        if conn is not None:
            conn.close()
class HTMLParserEx(HTMLParser):
    '''
    Lenient HTML parser that builds an ElementTree from sloppy markup.

    ``feed()`` pre-processes the markup with regular expressions before
    handing it to ``HTMLParser``:

    * the body of every <script> element is replaced by a numbered
      placeholder so its content cannot confuse the parser; the real body
      is restored in ``handle_data``;
    * <img>, <br> and <hr> are turned into self-closing tags;
    * closing tags carrying junk (``</p foo>``) are reduced to ``</p>``;
    * attributes of opening tags are re-quoted with double quotes
      (attributes without a value are dropped).

    ``close()`` returns the root element, or False when no tree could be
    built.
    '''

    def __init__(self):
        HTMLParser.__init__(self)
        self.tb = etree.TreeBuilder()
        # (opening tag, body, closing tag) of every script in the last feed.
        self.scripts = []
        self.scriptIndex = 0
        self.reInt = re.compile(r'\D')
        self.reScriptTags = re.compile(
            r'(<script\b[^>]*>)(.*?)(</script>)', re.DOTALL)
        self.reSelfClosingTags = re.compile(
            r'((<(img|br|hr)\b.*?)/?>)', re.DOTALL)
        # [^>]* rather than .* so the match stops at the end of this tag
        # and does not swallow the rest of the document.
        self.reInvalidClosingTags = re.compile(
            r'</(\w+)\s+[^>]*>', re.DOTALL)
        self.reAllOpeningTags = re.compile(
            r'(<[A-Z][A-Z0-9]*)\b(.*?)(/?>)', re.DOTALL | re.IGNORECASE)
        # Groups: 1 name, 2 '=', then exactly one of
        #   3 double-quoted value, 4 single-quoted value,
        #   5 unquoted value, 6 fallback for broken quoting.
        self.reTagAttrs = re.compile(
            r'''([_a-z]+[_a-z0-9-]*)(=)'''
            r'''(?:"([^"]*)"|'([^']*)'|([^\s"'>]+)|["']?([^"']*))''',
            re.DOTALL | re.IGNORECASE)

    def handle_starttag(self, tag, attributes):
        '''Open ``tag`` in the tree; builder errors are ignored.'''
        try:
            self.tb.start(tag, dict(attributes))
        except Exception:
            # Best effort: keep parsing malformed documents.
            pass

    def handle_endtag(self, tag):
        '''Close ``tag`` in the tree; builder errors are ignored.'''
        try:
            self.tb.end(tag)
        except Exception:
            # Unbalanced closing tags are common in real-world HTML.
            pass

    def handle_data(self, data):
        '''
        Add text to the tree, swapping script placeholders back for the
        original script body.
        '''
        if data.startswith('[REPLACE_SCRIPT:'):
            digits = self.reInt.sub('', data)
            # Placeholders are numbered from 1, self.scripts from 0.
            position = int(digits) - 1 if digits else -1
            if 0 <= position < len(self.scripts):
                self.tb.data(self.scripts[position][1])
                return
        self.tb.data(data)

    def close(self):
        '''Finish parsing and return the root element, or False on error.'''
        try:
            HTMLParser.close(self)
            return self.tb.close()
        except Exception:
            return False

    def feed(self, text, encoding='utf-8'):
        '''
        Clean up ``text`` and feed it to the parser.

        text     -- markup as text, or as bytes in ``encoding``.
        encoding -- used to decode ``text`` when it is bytes; undecodable
                    bytes are dropped.

        Returns the cleaned-up markup that was actually parsed.
        '''
        if isinstance(text, bytes):
            # Work on text throughout so script bodies and markup have
            # the same type.
            text = text.decode(encoding, 'ignore')

        self.scripts = self.reScriptTags.findall(text)
        self.scriptIndex = 0
        data = self.reScriptTags.sub(self.getScriptReplacement, text)
        data = self.reSelfClosingTags.sub(r'\2/>', data)
        data = self.reInvalidClosingTags.sub(r'</\1>', data)
        data = self.reAllOpeningTags.sub(self.getCleanOpeningTag, data)
        HTMLParser.feed(self, data)
        return data

    def getScriptReplacement(self, match):
        '''Return the script element with its body replaced by a placeholder.'''
        self.scriptIndex += 1
        return (match.group(1) + '[REPLACE_SCRIPT:' + str(self.scriptIndex)
                + ']' + match.group(3))

    def getCleanOpeningTag(self, match):
        '''Rebuild an opening tag with every attribute double-quoted.'''
        parts = [u'' + match.group(1)]
        for attr in self.reTagAttrs.finditer(match.group(2)):
            # Exactly one of the value groups took part in the match.
            value = next(g for g in attr.groups()[2:] if g is not None)
            # Only single-quoted values can contain a double quote.
            value = value.replace(u'"', u'&quot;')
            parts.append(u' ' + attr.group(1) + attr.group(2)
                         + u'"' + value + u'"')
        parts.append(match.group(3))
        return u''.join(parts)
class cssSelector(object):
    '''
    Small CSS-like selector engine for ElementTree elements.

    Supported syntax: tag names and ``*``, ``#id``, ``.class`` (several
    classes must all be present), attribute tests ``[a]``, ``[a=v]``,
    ``[a*=v]``, ``[a^=v]``, ``[a|=v]`` (prefix match), ``[a~=v]``,
    ``[a$=v]``, and the descendant combinator (a space).  Pseudo classes
    are parsed but ignored.  All comparisons are case-insensitive.

    ``find()`` stores its matches in ``self.selected``; the next
    ``find()`` without explicit elements searches inside those matches.
    '''

    def __init__(self, element, selector='*'):
        '''
        element  -- root element to search; anything that is not an
                    element is replaced by an empty <html> element.
        selector -- default selector used by ``find()``.
        '''
        # Check the interface without touching the document: writing a
        # marker attribute here would end up in serialised output.
        if hasattr(element, 'tag') and hasattr(element, 'get'):
            self.root = element
        else:
            self.root = etree.Element('html')
        self.selector = selector
        self.selected = []
        # Groups: attribute name, optional operator, optional value.
        self.reAttrs = re.compile(
            r'^(-?[_a-zA-Z]+[_a-zA-Z0-9-]*)([*|^~$]?=)?(.+)?$')

    def parseSelector(self, selector):
        '''
        Split ``selector`` into a list of dicts, one per compound selector.

        Possible keys: 'tag', 'id', 'class' (comma separated), 'attr'
        ('|||' separated raw bracket contents) and 'pseudo'.
        '''
        result = []
        select = {}
        idx = 'tag'
        for letter in selector:
            # Inside [...] every character belongs to the attribute test.
            insideAttr = idx == 'attr' and letter not in '[]'
            if not insideAttr:
                if letter == '#':
                    idx = 'id'
                    continue
                if letter == '.':
                    idx = 'class'
                    if idx in select:
                        select[idx] += ','
                    continue
                if letter == '[':
                    idx = 'attr'
                    if idx in select:
                        select[idx] += '|||'
                    continue
                if letter == ']':
                    idx = 'tag'
                    continue
                if letter == ':':
                    idx = 'pseudo'
                    continue
                if letter == ' ':
                    idx = 'tag'
                    result.append(select)
                    select = {}
                    continue
            select[idx] = select.get(idx, '') + letter
        if select:
            result.append(select)
        return result

    def _matchesAttr(self, element, expression):
        '''Return True when ``element`` passes one ``[...]`` attribute test.'''
        parsed = self.reAttrs.match(expression)
        if not parsed:
            # Unparsable tests are ignored.
            return True
        name, operator, wanted = parsed.groups()
        actual = element.get(name.lower(), '')
        if not actual:
            return False
        if operator is None:
            # Plain presence test.
            return True
        actual = actual.lower()
        wanted = (wanted or '').lower()
        # Accept [a="v"] and [a='v'] as well as [a=v].
        if len(wanted) >= 2 and wanted[0] == wanted[-1] and wanted[0] in '"\'':
            wanted = wanted[1:-1]
        if operator == '=':
            return actual == wanted
        if operator == '*=':
            return wanted in actual
        if operator in ('^=', '|='):
            return actual.startswith(wanted)
        if operator == '$=':
            return actual.endswith(wanted)
        if operator == '~=':
            return wanted in actual.split()
        return True

    def _matchesCompound(self, element, item):
        '''Return True when ``element`` satisfies every condition in ``item``.'''
        wantedTag = item.get('tag', '').lower()
        if wantedTag and wantedTag != '*':
            tag = element.tag
            # Comments and processing instructions have no string tag.
            if not hasattr(tag, 'lower') or tag.lower() != wantedTag:
                return False
        if 'id' in item:
            if element.get('id', '').lower() != item['id'].lower():
                return False
        if 'class' in item:
            wanted = [c for c in item['class'].lower().split(',') if c]
            present = set(element.get('class', '').lower().split())
            if not all(c in present for c in wanted):
                return False
        if 'attr' in item:
            for expression in item['attr'].split('|||'):
                if not self._matchesAttr(element, expression):
                    return False
        return True

    def domSelector(self, element, select):
        '''
        Return the elements in the subtree of ``element`` (itself
        included) that match ``select`` in document order.

        select -- list of compound selectors from ``parseSelector()``.
                  With more than one entry, an element matches when it
                  satisfies the last entry and has ancestors inside the
                  searched subtree satisfying the earlier entries in order.
        '''
        result = []
        if not hasattr(element, 'tag') or not hasattr(element, 'get'):
            return result

        # Empty entries come from repeated spaces; an empty selector
        # matches every element.
        compounds = [item for item in select if item] or [{}]
        last = len(compounds) - 1

        # Iterative depth-first walk.  ``depth`` is the number of leading
        # compound selectors already matched by ancestors; matching them
        # as early as possible never loses a result for pure descendant
        # combinators.
        stack = [(element, 0)]
        while stack:
            node, depth = stack.pop()
            if self._matchesCompound(node, compounds[depth]):
                if depth == last:
                    result.append(node)
                else:
                    depth += 1
            # Reversed so children are popped in document order.
            stack.extend((child, depth) for child in reversed(list(node)))
        return result

    def find(self, selector=None, elements=None):
        '''
        Search for ``selector`` and remember the matches.

        selector -- selector string; defaults to ``self.selector``.
        elements -- element or list of elements to search in.  When
                    omitted or empty, the previous matches are searched,
                    or the root if there are none.

        Returns True when something matched; ``self.selected`` is only
        replaced in that case.
        '''
        if selector is None:
            selector = self.selector
        select = self.parseSelector(selector)

        # An element without children is falsy, so test for the element
        # interface before testing for emptiness.
        if not hasattr(elements, 'tag') and not elements:
            elements = self.selected if self.selected else [self.root]
        elif not isinstance(elements, (list, tuple)):
            elements = [elements]

        result = []
        seen = set()
        for el in elements:
            for match in self.domSelector(el, select):
                # Nested search roots would report the same element twice.
                if id(match) not in seen:
                    seen.add(id(match))
                    result.append(match)

        if result:
            self.selected = result
        return bool(result)

    @classmethod
    def toString(cls, element, encoding='utf-8'):
        '''Serialise ``element``; returns False when that is not possible.'''
        try:
            return etree.tostring(element, encoding)
        except Exception:
            return False
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement