Advertisement
Guest User

HTMLParserEx

a guest
Apr 11th, 2015
263
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.93 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. '''
  4.    HTMLParserEx Script
  5.    Copyright (C) 2015 Amin Paks
  6.  
  7.    This program is free software: you can redistribute it and/or modify
  8.    it under the terms of the GNU General Public License as published by
  9.    the Free Software Foundation, either version 3 of the License, or
  10.    (at your option) any later version.
  11.  
  12.    This program is distributed in the hope that it will be useful,
  13.    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14.    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15.    GNU General Public License for more details.
  16.  
  17.    You should have received a copy of the GNU General Public License
  18.    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19. '''
  20.  
  21. import sys, re, urllib, urllib2, httplib, urlparse, xml, time
  22. from HTMLParser import HTMLParser
  23. from xml.etree import cElementTree as etree
  24.  
  class httpConn(object):
    '''
      Small wrapper around httplib that keeps a single connection open and
      reuses it for consecutive requests, optionally going through a proxy.

      Attributes:
        url      -- list holding the pieces of the last parsed target URL in
                    urlparse order: [0] scheme, [1] host, [2] path,
                    [3] params (also used as the request body), [4] query,
                    [5] fragment. The remaining slots are never written here.
        proxy    -- [scheme, host] of the proxy, filled by parse(proxy=True).
        isProxy  -- when true, connections are opened to self.proxy.
        conn     -- the httplib connection; the attribute only exists while
                    a connection is held.
        response -- response object returned by the last request().
    '''

    def __init__(self, url='', proxy=False):
      '''
        url   -- target URL, or the proxy URL when proxy is true.
        proxy -- when true, url is stored as the proxy address and every
                 connection is opened to that host.
        No connection is opened here.
      '''
      self.url = ['', '', '', '', '', '', '', '', '', '']
      self.proxy = ['', '']
      self.isProxy = proxy
      self.parse(url, connect=False, proxy=proxy)

    def __del__(self):
      self.close()

    def __enter__(self):
      return self

    def __exit__(self, type, value, traceback):
      self.close()

    def parse(self, url='', connect=True, proxy=False):
      '''
        Merge url into the stored URL pieces (self.proxy when proxy is true,
        self.url otherwise).

        An empty scheme or host in url keeps the stored one, so relative
        URLs reuse the current host. Path, params, query and fragment are
        always replaced, and an empty path becomes '/'.

        connect -- when true, connect() is called afterwards.
      '''
      parts = urlparse.urlparse(url)
      target = self.proxy if proxy else self.url

      # A held connection is bound to one scheme and one host. Drop it as
      # soon as either changes, otherwise connect() would keep reusing it.
      # Behind a proxy the connection goes to the proxy and stays valid.
      if not self.isProxy:
        schemeChanged = bool(parts[0]) and parts[0] != target[0]
        hostChanged = bool(parts[1]) and parts[1] != target[1]
        if schemeChanged or hostChanged:
          self.close()

      for idx, part in enumerate(parts):
        if idx <= 1:
          # scheme / host: only overwrite with a non-empty value
          if part:
            target[idx] = part
        elif idx < len(target):
          # self.proxy has two slots only, so nothing else is stored for it
          target[idx] = part

      if len(target) > 2 and not target[2]:
        target[2] = '/'

      if connect:
        self.connect()

    def connect(self, reconnect=False):
      '''
        Make sure self.conn exists and return True.

        reconnect -- when true, an existing connection is closed and
                     replaced by a new one.

        Raises Exception when the scheme is not http/https or the host is
        empty. The connection object is only created here; httplib opens
        the socket when the first request is sent.
      '''
      if 'conn' in self.__dict__ and not reconnect:
        return True

      target = self.proxy if self.isProxy else self.url
      factories = {
        'http': httplib.HTTPConnection,
        'https': httplib.HTTPSConnection,
      }

      if not target[1] or target[0] not in factories:
        raise Exception('No protocol or host indicated!!')

      # Close the old connection first so a reconnect does not leak it.
      self.close()
      self.conn = factories[target[0]](target[1])
      return True

    def request(self, url='', params=None, headers=None, reqType='GET'):
      '''
        Send a request and return the httplib response (also kept in
        self.response).

        url     -- absolute or relative URL, merged through parse().
        params  -- mapping or sequence of pairs; when non-empty it is
                   url-encoded and sent as the request body, whatever
                   reqType is. Values that cannot be encoded are ignored.
        headers -- dict of request headers.
        reqType -- HTTP method name.
      '''
      self.parse(url)

      if params:
        try:
          self.url[3] = urllib.urlencode(params)
        except (TypeError, ValueError):
          # best effort: params that cannot be encoded are not sent
          pass

      reqUrl = self.url[2]

      if self.isProxy:
        # a proxy needs the absolute URL of the target
        reqUrl = self.url[0] + '://' + self.url[1] + reqUrl

      if self.url[4]:
        reqUrl += '?' + self.url[4]

      if headers is None:
        headers = {}

      self.conn.request(reqType, reqUrl, self.url[3], headers)
      self.response = self.conn.getresponse()
      return self.response

    def close(self):
      '''Close and forget the current connection, if there is one.'''
      if 'conn' in self.__dict__:
        self.conn.close()
        del self.conn
  114.  
  class HTMLParserEx(HTMLParser):
    '''
      HTMLParser that builds an ElementTree out of loosely written HTML.

      feed() first cleans the markup with regular expressions: script
      bodies are swapped for placeholders, img/br/hr are made self-closing,
      junk inside closing tags is removed and attributes are re-quoted.
      The parser callbacks then drive an etree.TreeBuilder, and close()
      returns the root element.
    '''

    def __init__(self):
      HTMLParser.__init__(self)
      self.tb = etree.TreeBuilder()
      self.scripts = []
      self.scriptIndex = 0
      self.reInt = re.compile(r'\D')
      self.reScriptTags = re.compile(r'(<script\b[^>]*>)(.*?)(</script>)', re.DOTALL)
      self.reSelfClosingTags = re.compile(r'((<(img|br|hr)\b.*?)/?>)', re.DOTALL)
      # [^>]* keeps the match inside one closing tag; a greedy '.*' with
      # DOTALL would swallow everything up to the last '>' of the input.
      self.reInvalidClosingTags = re.compile(r'</(\w+)\s+[^>]*>', re.DOTALL)

      self.reAllOpeningTags = re.compile(r'(<[A-Z][A-Z0-9]*)\b(.*?)(/?>)', re.DOTALL | re.IGNORECASE)

      self.reTagAttrs = re.compile(r'([_a-z]+[_a-z0-9-]*)(=)["\']{0,1}([^"\']*)', re.DOTALL| re.IGNORECASE)

    def handle_starttag(self, tag, attributes):
      '''Open an element in the tree builder; failures are ignored.'''
      try:
        self.tb.start(tag, dict(attributes))
      except Exception:
        # best effort: broken markup must not stop the parse
        pass

    def handle_endtag(self, tag):
      '''Close an element in the tree builder; failures are ignored.'''
      try:
        self.tb.end(tag)
      except Exception:
        # best effort: stray closing tags are skipped
        pass

    def handle_data(self, data):
      '''
        Add text to the tree. A script placeholder written by feed() is
        replaced by the original script body; when the placeholder cannot
        be resolved the text is stored unchanged.
      '''
      if data.startswith('[REPLACE_SCRIPT:'):
        try:
          # strip everything but the digits to recover the script index
          idx = int(self.reInt.sub('', data))
          data = self.scripts[idx][1]
        except (ValueError, IndexError):
          pass

      self.tb.data(data)

    def close(self):
      '''
        Finish parsing and return the root element, or False when the
        parser or the tree builder raises.
      '''
      try:
        HTMLParser.close(self)
        return self.tb.close()
      except Exception:
        return False

    def feed(self, text, encoding='utf-8'):
      '''
        Clean up text and hand it to the underlying HTMLParser.

        text     -- markup; a byte string is decoded with encoding first,
                    undecodable bytes being dropped.
        encoding -- only used to decode byte-string input.

        Returns the cleaned markup that was actually parsed. The list of
        script bodies is rebuilt on every call.
      '''
      if isinstance(text, bytes):
        text = text.decode(encoding, 'ignore')

      # Collected from the same string the placeholders are written into,
      # so the n-th placeholder always refers to the n-th entry.
      self.scripts = self.reScriptTags.findall(text)
      self.scriptIndex = 0

      data = self.reScriptTags.sub(self.getScriptReplacement, text)
      data = self.reSelfClosingTags.sub(r'\2/>', data)
      data = self.reInvalidClosingTags.sub(r'</\1>', data)
      data = self.reAllOpeningTags.sub(self.getCleanOpeningTag, data)

      HTMLParser.feed(self, data)

      return data

    def getScriptReplacement(self, match):
      '''
        Return the script element with its body replaced by a placeholder
        holding the zero-based position of the script in self.scripts.
      '''
      placeholder = '[REPLACE_SCRIPT:' + str(self.scriptIndex) + ']'
      # advance only after use, so that numbering starts at 0 like the list
      self.scriptIndex += 1
      return match.group(1) + placeholder + match.group(3)

    def getCleanOpeningTag(self, match):
      '''
        Rebuild an opening tag from its name=value attributes, writing
        every value inside double quotes. Attributes without '=' are not
        matched by reTagAttrs and are therefore dropped.
      '''
      pieces = [u'' + match.group(1)]
      for name, equals, value in self.reTagAttrs.findall(match.group(2)):
        pieces.append(u' ' + name + equals + u'"' + value + u'"')
      pieces.append(match.group(3))
      return u''.join(pieces)
  180.  
  class cssSelector(object):
    '''
      Select elements of an ElementTree with a small subset of CSS:
      tag names (and '*'), #id, .class, and [attr], [attr=v], [attr^=v],
      [attr|=v], [attr*=v] tests. All comparisons are case-insensitive.

      find() stores its matches in self.selected, and the next find()
      without explicit elements searches inside those matches.

      NOTE(review): a selector with spaces ('div p') is split into several
      parts, but they are all tested against the same element, a later
      part overriding the tag/id/class/attr test of an earlier one. This
      is not CSS descendant matching; chain find() calls for that.
    '''

    def __init__(self, element, selector='*'):
      '''
        element  -- root element to search in; anything without a usable
                    set() method is replaced by an empty <html> element.
        selector -- selector used by find() when none is passed.
      '''
      try:
        # The marker has to be a string: etree.tostring() refuses to
        # serialize attribute values of any other type.
        element.set('_doc', 'True')
        self.root = element
      except (AttributeError, TypeError):
        self.root = etree.Element('html')

      self.selector = selector
      self.selected = []
      self.reAttrs = re.compile(r'^(-?[_a-zA-Z]+[_a-zA-Z0-9-]*)(\*?\|?\^?~?=)?(.+)?$')

    def parseSelector(self, selector):
      '''
        Split selector into a list of dicts, one per space separated part.
        Possible keys: 'tag', 'id', 'class' (names joined by ','),
        'attr' (tests joined by '|||') and 'pseudo'.
      '''
      result = []
      select = {}

      idx = 'tag'

      for letter in selector:
        if idx == 'attr' and not letter in ['[', ']']:
          # inside [...] every character belongs to the attribute test
          pass
        elif letter == '#':
          idx = 'id'
          continue

        elif letter == '.':
          idx = 'class'
          if idx in select:
            select[ idx ] += ','
          continue

        elif letter == '[':
          idx = 'attr'
          if idx in select:
            select[ idx ] += '|||'
          continue

        elif letter == ']':
          idx = 'tag'
          continue

        elif letter == ':':
          idx = 'pseudo'
          continue

        elif letter in [' ']:
          idx = 'tag'
          result.append( select )
          select = {}
          continue

        if not idx in select:
          select[ idx ] = ''

        select[ idx ] += letter

      if len( select ):
        result.append( select )

      return result

    def _matchAttr(self, element, spec):
      '''
        Test one attribute expression such as 'href^=http' on element.
        An expression that cannot be parsed counts as matching; a missing
        or empty attribute never matches.
      '''
      parsed = self.reAttrs.match(spec)
      if not parsed:
        return True

      name, operator, expected = parsed.groups()
      actual = element.get(name.lower(), '')
      if not actual:
        return False

      actual = actual.lower()
      expected = expected or ''
      # [type="text"] and [type=text] mean the same thing
      if len(expected) >= 2 and expected[0] == expected[-1] and expected[0] in ('"', "'"):
        expected = expected[1:-1]
      expected = expected.lower()

      if operator == '*=':
        return expected in actual
      if operator in ('^=', '|='):
        return actual.startswith(expected)
      if operator == '=':
        return actual == expected

      # no operator, or one without a rule here: presence is enough
      return True

    def domSelector(self, element, select):
      '''
        Return element and all of its descendants, in document order, that
        satisfy select (a list as returned by parseSelector). Objects that
        are not elements yield an empty list.
      '''
      result = []

      try:
        children = list(element)
        tagName = element.tag.lower()
      except (TypeError, AttributeError):
        return result

      tagCondition = True
      idCondition = True
      classCondition = True
      attrCondition = True

      for item in select:
        if 'tag' in item:
          wanted = item[ 'tag' ].lower()
          tagCondition = wanted == '*' or tagName == wanted

        if 'id' in item:
          idCondition = element.get('id', '').lower() == item[ 'id' ].lower()

        if 'class' in item:
          wantedClasses = [c for c in item[ 'class' ].lower().split(',') if c]
          ownClasses = element.get('class', '').lower().split()
          # '.a.b' requires both classes on the element
          classCondition = all(c in ownClasses for c in wantedClasses)

        if 'attr' in item:
          # every [..] test has to hold, not just the last one
          attrCondition = all(
            self._matchAttr(element, spec) for spec in item[ 'attr' ].split('|||')
          )

      if tagCondition and idCondition and classCondition and attrCondition:
        result.append( element )

      for child in children:
        result += self.domSelector( child, select )

      return result

    def find(self, selector=None, elements=None):
      '''
        Search for selector and return True when something matched.

        selector -- selector string; defaults to self.selector.
        elements -- element or list of elements to search in (the elements
                    themselves included). When omitted or empty, the
                    previous matches are searched, or the root when there
                    are none yet.

        self.selected is replaced only when the search finds something.
      '''
      if selector is None:
        selector = self.selector

      select = self.parseSelector( selector )

      # An Element without children is falsy, so test for an element
      # before testing for "nothing given".
      if not hasattr(elements, 'tag') and not elements:
        if len( self.selected ):
          elements = self.selected
        else:
          elements = [self.root]
      elif not type(elements) is list:
        elements = [elements]

      result = []
      for el in elements:
        result += self.domSelector( el, select )

      if len( result ):
        self.selected = result

      return bool(len( result ))

    @classmethod
    def toString(cls, element, encoding='utf-8'):
      '''Serialize element with etree.tostring(); return False on failure.'''
      try:
        return etree.tostring( element, encoding )
      except Exception:
        return False
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement