#!/usr/bin/env python
# Created By Abraham Aranguren <name.surname@gmail.com> Twitter: @7a_ http://7-a.org
# Requires lxml, installation instructions here: http://lxml.de/installation.html
# Installation in Backtrack 5: /usr/bin/easy_install --allow-hosts=lxml.de,*.python.org lxml
# Tip for Ubuntu courtesy of Mario Heiderich: python2.7-dev is needed to compile this library properly
# Clean HTML reference: http://lxml.de/lxmlhtml.html#cleaning-up-html
# Library documentation: http://lxml.de/api/lxml.html.clean.Cleaner-class.html
from lxml.html.clean import Cleaner, clean_html
import lxml.html
from urlparse import urlparse

ALLOWED_TAGS = ('html', 'body', 'a', 'p', 'h1', 'h2', 'h3', 'h4', 'div', 'span', 'i', 'b', 'u', 'table', 'tbody', 'tr', 'td', 'th', 'strong', 'em', 'sup', 'sub', 'ul', 'ol', 'li')
ALLOWED_URL_SCHEMES = ('http', 'https', 'ftp', 'mailto', 'sftp', 'shttp')

class HTMLSanitiser:
    def __init__(self):
        # Most built-in removal passes are disabled here: the tag whitelist
        # (allow_tags) and safe_attrs_only=True do the filtering instead
        self.Cleaner = Cleaner(scripts=False, javascript=False, comments=False, links=False, meta=True,
                               page_structure=False, processing_instructions=False, embedded=False,
                               frames=False, forms=False, annoying_tags=False, remove_unknown_tags=False,
                               safe_attrs_only=True, allow_tags=ALLOWED_TAGS)

    def IsValidURL(self, URL):
        # Whitelist by scheme: anything not explicitly allowed is rejected,
        # including relative URLs, whose scheme parses as an empty string
        ParsedURL = urlparse(URL)
        return ParsedURL.scheme in ALLOWED_URL_SCHEMES
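
    # Illustrative behaviour (assuming Python 2's stdlib urlparse):
    #   IsValidURL('https://example.com/') -> True
    #   IsValidURL('javascript:alert(1)')  -> False ('javascript' is not whitelisted)
    #   IsValidURL('/relative/path')       -> False (scheme parses as '')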

    def CleanURLs(self, HTML):
        # Largely inspired by: http://stackoverflow.com/questions/5789127/how-to-replace-links-using-lxml-and-iterlinks
        ParsedHTML = lxml.html.document_fromstring(HTML)
        for Element, Attribute, Link, Pos in ParsedHTML.iterlinks():
            if not self.IsValidURL(Link):
                # Blank out the disallowed link; iterlinks() yields
                # Attribute=None when the link occurs in element text
                # (e.g. a CSS url()), so there is no attribute to clear then
                if Attribute is not None:
                    Element.set(Attribute, '')
        return lxml.html.tostring(ParsedHTML)
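
    # Example pass (illustrative; serialised markup can vary slightly between
    # lxml versions): CleanURLs('<a href="javascript:alert(1)">x</a>') blanks
    # the href to href="", while http/https/ftp/mailto links pass through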

    def CleanThirdPartyHTML(self, HTML):
        # Defence in depth: 1st scrub URLs by scheme, 2nd apply lxml's default
        # clean_html() (removes scripts, javascript, etc.), 3rd enforce the tag
        # whitelist with the custom Cleaner
        return self.Cleaner.clean_html(clean_html(self.CleanURLs(HTML)))

# For testing as a standalone script:
if __name__ == '__main__':
    Sanitiser = HTMLSanitiser()
    with open('input.txt') as File:
        print Sanitiser.CleanThirdPartyHTML(File.read())
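
    # Hedged inline smoke test (hypothetical input, not part of the original
    # script): the <script> block, the onmouseover handler and the
    # javascript: URL should all be removed or blanked
    Example = ('<html><body><script>alert(1)</script>'
               '<a href="javascript:alert(1)" onmouseover="alert(2)">link</a>'
               '<p>Hello <b>world</b></p></body></html>')
    print Sanitiser.CleanThirdPartyHTML(Example)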