Advertisement
pellekrogholt

Untitled

Nov 9th, 2012
177
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 15.14 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. """
  4. Stone-Age HTML Filter: prepare documents for e-mail distribution.
  5.  
  6.    Copyright (C) 2007 Malthe Borch
  7.  
  8.    This library is free software; you can redistribute it and/or
  9.    modify it under the terms of the GNU Lesser General Public
  10.    License as published by the Free Software Foundation; either
  11.    version 2.1 of the License, or (at your option) any later version.
  12.  
  13.    This library is distributed in the hope that it will be useful,
  14.    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15.    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16.    Lesser General Public License for more details.
  17.  
  18.    You should have received a copy of the GNU Lesser General Public
  19.    License along with this library; if not, write to the Free Software
  20.    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  21.  
  22. usage:
  23.  
  24.  stoneagehtml.compactify(text)
  25.  
  26.  (see function def for details)
  27.  
  28. """
  29.  
  30. from BeautifulSoup import BeautifulSoup
  31. import cssutils
  32. import re
  33.  
  34. # regex: selectors
  35. regex_selector_id = re.compile('((?:\.|#)[\w\-_]+)')
  36. regex_selector = re.compile('(\w+)?(#([\w\-_]+))?(\.([\w\-_]+))?(\*)?')
  37.  
  38. # regex: compound css-tags
  39. regex_tags = {'background': re.compile('^ *((?!url)(?P<color>[#\w]+))? *((?P<image>url *\([^\)]+\)) *'+
  40.                                        '(?P<repeat>(no-)?repeat(-(x|xy|y))?)? *'+
  41.                                        '(?P<attachment>(scroll|fixed))? *'+
  42.                                        '(?P<position>(top|bottom|left|center|right| |[-\w%]+)+)?)?'),
  43.               }
  44.  
  45. # default tag black-list based on Google Mail's style filter
  46. tag_blacklist=['visibility',
  47.                'font-family',
  48.                'height',
  49.                'list-style-image',
  50.                'top', 'bottom', 'left', 'right',
  51.                'z-index',
  52.                'position',
  53.                'background-image', 'background-repeat', 'background-position']
  54.  
  55. import logging
  56. cssutils.log.setLevel(logging.CRITICAL)
  57.  
  58. # CSSUTILS PREFERENCES
  59. cssutils.ser.prefs.keepAllProperties = False
  60. cssutils.ser.prefs.keepComments = False
  61. cssutils.ser.prefs.keepEmptyRules = False
  62. cssutils.ser.prefs.keepUnknownAtRules = False
  63. cssutils.ser.prefs.keepUsedNamespaceRulesOnly = True
  64. cssutils.ser.prefs.resolveVariables = True
  65. cssutils.ser.prefs.validOnly = True
  66.  
  67.  
  68. def trim_dictionary(d):
  69.     for key, value in d.items():
  70.         if not value:
  71.             del d[key]
  72.  
  73.     return d
  74.  
  75. def find_attribute(key, attrs):
  76.     for k, v in attrs:
  77.         if key == k: return v
  78.  
  79.     return None
  80.  
  81. def tagQuery(tag, tag_name, attrs):
  82.     """Custom tag matcher. Takes into account that tags can
  83.    have several classes."""
  84.  
  85.     if tag_name and tag_name != tag.name:
  86.         return False
  87.  
  88.     for key, value in attrs.items():
  89.         tag_attribute_value = find_attribute(key, tag.attrs)
  90.         if not tag_attribute_value:
  91.             return False
  92.  
  93.         if value in tag_attribute_value.split():
  94.             continue
  95.  
  96.         return False
  97.  
  98.     return True
  99.  
  100. def compactify(text, *args, **kwargs):
  101.     return CompactifyingSoup(text).compactify(*args, **kwargs)
  102.  
  103. class CompactifyingSoup(BeautifulSoup):
  104.     class_prefix = 'c'
  105.     id_prefix = 'i'
  106.  
  107.     def compactify(self,
  108.                    abbreviation_enabled=False,
  109.                    styles_in_tags=True,
  110.                    filter_tags=True,
  111.                    expand_css_properties=True, # experimental
  112.                    remove_classnames_and_ids=False,
  113.                    media=(u'screen',),
  114.                    remove_inline_style=True # very experimental if set to False
  115.                    ):
  116.  
  117.         """
  118.        This function processes an HTML-soup with two purposes:
  119.  
  120.        * To reduce the size by abbreviating class names and identifiers and
  121.          removing unused css-declarations
  122.  
  123.        * Degrades the markup detail to provide compatibility with browsers
  124.          and interface which do not support the full CSS ruleset.
  125.  
  126.        This is demonstrated below.
  127.  
  128.        >>> text = \"""
  129.        ... <html>
  130.        ... <head><style>
  131.        ... #a { margin: 0 }
  132.        ... .a { margin: 1em }
  133.        ... span.b { padding: 0 }
  134.        ... div.b { padding: 1em }
  135.        ... @media screen { div.a { top: 0 }}
  136.        ... .c { background: white url(text.gif) no-repeat fixed bottom left !important }
  137.        ... .d { background: url(text.gif) repeat-x 2px -8px }
  138.        ... #a span { display: block }
  139.        ... .a span { display: none }
  140.        ... </style></head>
  141.        ... <body>
  142.        ... <div id='a'>
  143.        ...   <span class='b c'>test</span>
  144.        ...   <div class='d'><!-- nothing here --></div>
  145.        ...   <span>test</span>
  146.        ... </div>
  147.        ... </body>
  148.        ... </html>\"""
  149.  
  150.        >>> print compactify(text, filter_tags=False)
  151.        <BLANKLINE>
  152.        <html>
  153.        <head></head>
  154.        <body>
  155.        <div id=\"a\" style=\"margin: 0\">
  156.        <span class=\"b c\" style=\"padding: 0; background-color: white !important; background-position: bottom left !important; background-image: url(text.gif) !important; background-repeat: no-repeat !important; background-attachment: fixed !important; display: block\">test</span>
  157.        <div class=\"d\" style=\"background-position: 2px -8px; background-image: url(text.gif); background-repeat: repeat-x\"><!-- nothing here --></div>
  158.        <span style=\"display: block\">test</span>
  159.        </div>
  160.        </body>
  161.        </html>
  162.  
  163.        """
  164.  
  165.         # save arguments
  166.         self.filter_tags = filter_tags
  167.         self.expand_css_properties = expand_css_properties
  168.         self.media = media
  169.  
  170.         self.classes = {}
  171.         self.identifiers = {}
  172.  
  173.         #import pdb; pdb.set_trace()
  174.  
  175.         # optimize class identifiers
  176.         count = 0
  177.         for tag in self.findAll():
  178.             class_def = tag.get('class', None)
  179.             id_def = tag.get('id', None)
  180.             if class_def:
  181.                 # convert class-identifiers to abbreviated versions
  182.                 short_names = []
  183.                 for c in class_def.split(' '):
  184.                     name = c.strip()
  185.                     short_name = self.classes.get(name, "%s%s" % (self.class_prefix, count))
  186.                     if not name in self.classes:
  187.                         # store abbr. identifier in dictionary
  188.                         self.classes[name] = short_name
  189.                         count += 1
  190.  
  191.                         short_names.append(short_name)
  192.  
  193.                 if abbreviation_enabled:
  194.                     tag['class'] = ' '.join(short_names)
  195.  
  196.             if id_def:
  197.                 # convert class-identifiers to abbreviated versions
  198.                 short_names = []
  199.                 for c in id_def.split(' '):
  200.                     name = c.strip()
  201.                     short_name = self.identifiers.get(name, "%s%s" % (self.id_prefix, count))
  202.                     if not name in self.identifiers:
  203.                         # store abbr. identifier in dictionary
  204.                         self.identifiers[name] = short_name
  205.                         count += 1
  206.  
  207.                     short_names.append(short_name)
  208.  
  209.                 if abbreviation_enabled:
  210.                     tag['id'] = ' '.join(short_names)
  211.  
  212.         #import pdb; pdb.set_trace()
  213.  
  214.         style_defs = self.findAll('style')
  215.         for style_def in style_defs:
  216.             # assert non-empty
  217.             if not style_def.contents:
  218.                 continue
  219.  
  220.             style = style_def.contents[0]
  221.  
  222.             # remove unused rules
  223.             sheet = cssutils.parseString(style)
  224.             ### INFO: workaround of bug:
  225.             ### http://code.google.com/p/cssutils/issues/detail?id=39
  226.             ### TODO: after bugfix restore to easier to read:
  227.             # sheet.cssRules = self.filterCSSDeclarations(sheet.cssRules)
  228.             #import pdb; pdb.set_trace()
  229.             filtered_cssrules = self.filterCSSDeclarations(sheet.cssRules)
  230.             del sheet.cssRules[:]
  231.             for fcss in filtered_cssrules:
  232.                 sheet.cssRules.append(fcss)
  233.             style = sheet.cssText
  234.             #import pdb; pdb.set_trace()
  235.             # convert identifiers
  236.             if abbreviation_enabled:
  237.                 for name, short_name in self.classes.items():
  238.                     style = style.replace('.%s ' % name, '.%s ' % short_name)
  239.                     style = style.replace('.%s.' % name, '.%s.' % short_name)
  240.                     style = style.replace('.%s,' % name, '.%s,' % short_name)
  241.  
  242.                 for name, short_name in self.identifiers.items():
  243.                     style = style.replace('#%s ' % name, '#%s ' % short_name)
  244.                     style = style.replace('#%s.' % name, '#%s.' % short_name)
  245.                     style = style.replace('#%s,' % name, '#%s,' % short_name)
  246.  
  247.             style_def.contents[0].replaceWith(style)
  248.  
  249.             if styles_in_tags:
  250.                 # distribute styles
  251.                 for rule in sheet.cssRules:
  252.                     self.distributeCSSDeclaration(rule)
  253.  
  254.                 # remove class names and identifiers from tags
  255.                 if remove_classnames_and_ids:
  256.                     for tag in self.findAll():
  257.                         tag.attrs = filter(lambda (key, value): key not in ('class', 'id'),
  258.                                            tag.attrs)
  259.                
  260.                 #import pdb; pdb.set_trace()
  261.                 # remove inline style-declarations
  262.                 if (remove_inline_style):
  263.                     style_def.extract()
  264.  
  265.         #import pdb; pdb.set_trace()
  266.  
  267.         return self.renderContents()
  268.  
  269.     def distributeCSSDeclaration(self, rule):
  270.         if isinstance(rule, cssutils.css.CSSComment):
  271.             return
  272.         elif isinstance(rule, cssutils.css.CSSMediaRule):
  273.             # verify that media is valid
  274.             valid_media = False
  275.             for med in rule.media:
  276.                 if med.mediaText in self.media or med.mediaText == 'all':
  277.                     valid_media = True
  278.                     break
  279.  
  280.             if not valid_media:
  281.                 return
  282.  
  283.             for rul in rule.cssRules:
  284.                 self.distributeCSSDeclaration(rul)
  285.         else:
  286.             for selector in rule.selectorList:
  287.                 # create selector datastructure
  288.                 selectors = []
  289.                 for match in regex_selector.finditer(selector.selectorText):
  290.                     if not match.group(0):
  291.                         continue
  292.  
  293.                     selectors.append(
  294.                         (match.group(1), trim_dictionary({'class': match.group(5),
  295.                                                           'id': match.group(3)})))
  296.  
  297.                 # distribute selector to document
  298.                 self.distributeCSSRule(rule, self, selectors)
  299.  
  300.     def expandProperty(self, style, prop):
  301.         value = style.getPropertyValue(prop)
  302.         important = style.getPropertyPriority(prop)
  303.  
  304.         style.removeProperty(prop)
  305.  
  306.         # handle properties
  307.         regex = regex_tags[prop]
  308.         match = regex.match(value)
  309.  
  310.         if match:
  311.             for p, v in match.groupdict().items():
  312.                 aggregate_property = '-'.join((prop,p))
  313.                 if v is not None:
  314.                     style.setProperty(aggregate_property, v, priority=important)
  315.         else:
  316.             style.setProperty(prop, value)
  317.  
  318.     def distributeCSSRule(self, rule, basetag, selectors):
  319.         tag_name, attrs = selectors[0]
  320.         tags = basetag.findAll(lambda tag: tagQuery(tag, tag_name, attrs))
  321.  
  322.         # walk down all matching paths
  323.         for tag in tags:
  324.             if len(selectors) > 1:
  325.                 # continue matching down this path
  326.                 self.distributeCSSRule(rule, tag, selectors[1:])
  327.             else:
  328.                 # expand properties
  329.                 if self.expand_css_properties:
  330.                     i = 0
  331.                     while i < rule.style.length:
  332.                         prop = rule.style.item(i)
  333.  
  334.                         # check if property is in expand list
  335.                         if prop in regex_tags.keys():
  336.                             self.expandProperty(rule.style, prop)
  337.  
  338.                         i += 1
  339.  
  340.                 # filter out blacklisted properties
  341.                 if self.filter_tags:
  342.                     i = 0
  343.                     while i < rule.style.length:
  344.                         prop = rule.style.item(i)
  345.  
  346.                         # check if property is in blacklist
  347.                         if prop in tag_blacklist:
  348.                             rule.style.removeProperty(prop)
  349.                         else:
  350.                             i += 1
  351.  
  352.                 # format style-declaration
  353.                 style = rule.style.cssText.replace('\n', ' ').strip(' \n\r')
  354.                 while '  ' in style:
  355.                     style = style.replace('  ', ' ')
  356.  
  357.                 # apply to tags
  358.                 attrs = tag.attrs
  359.                 for i in range(len(attrs)):
  360.                     attr, value = attrs[i]
  361.                     if attr.lower() == 'style':
  362.                         if style:
  363.                             attrs[i] = ('style', '%s; %s' % (value, style))
  364.                             style = None
  365.                         break
  366.  
  367.                 if style:
  368.                     attrs.append(('style', style))
  369.  
  370.     def filterCSSDeclarations(self, cssRules):
  371.         rules = []
  372.         for rule in cssRules:
  373.             if isinstance(rule, cssutils.css.CSSComment):
  374.                 continue
  375.  
  376.             if isinstance(rule, cssutils.css.CSSMediaRule):
  377.                 filtered_rules = self.filterCSSDeclarations(rule.cssRules)
  378.  
  379.                 # api requires explicit removal
  380.                 i = 0
  381.                 while i < len(rule.cssRules):
  382.                     r = rule.cssRules[i]
  383.                     if r not in filtered_rules:
  384.                         rule.deleteRule(i)
  385.                     else:
  386.                         i += 1
  387.  
  388.                 rules.append(rule)
  389.                 continue
  390.  
  391.             # only include rules with at least one used selector
  392.             try:
  393.                 selector_list = self.filterCSSDeclaration(rule)
  394.             except:
  395.                 continue
  396.  
  397.             if len(selector_list):
  398.                 rule.selectorList = selector_list
  399.                 rules.append(rule)
  400.  
  401.         return rules
  402.  
  403.     def filterCSSDeclaration(self, rule):
  404.         selector_list = cssutils.css.selectorlist.SelectorList()
  405.         for selector in rule.selectorList:
  406.             # remove unused selectors
  407.             iterator = regex_selector_id.finditer(selector.selectorText)
  408.  
  409.             add = True
  410.             for match in iterator:
  411.                 s = match.group(1)
  412.                 if s.startswith('.') and s[1:] not in self.classes:
  413.                     add = False
  414.                     break
  415.                 elif s.startswith('#') and s[1:] not in self.identifiers:
  416.                     add = False
  417.                     break
  418.  
  419.             if add: selector_list.appendSelector(selector.selectorText)
  420.         return selector_list
  421.  
  422. def _test():
  423.     import doctest
  424.     doctest.testmod()
  425.  
  426. if __name__ == "__main__":
  427.     _test()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement