This week only. Pastebin PRO Accounts Christmas Special! Don't miss out! Want more features on Pastebin? Sign Up, it's FREE!
Guest

Mike

By: a guest on Nov 10th, 2009  |  syntax: Perl  |  size: 1.70 KB  |  views: 154  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. #!/usr/bin/perl
  2. use warnings;
  3. use strict;
  4.  
  5. use HTML::TreeBuilder;
  6. use CSS::DOM::Style;
  7.  
  8. # Demo: convert presentational inline CSS into the equivalent HTML tags
  9. # (e.g. font-weight:bold becomes <strong>), then strip the remaining
  10. # attribute and span/div markup from the serialized output.
  11.  
  12. # Single-quoted heredoc: the sample markup is a literal, never interpolated.
  13. my $html = <<'HTML';
  14. <p style="text-align:center"><span style="font-weight:bold;font-style:italic;">Here's some text here.</span></p>
  15. <p><span style="text-decoration:underline;">And some more.</span> Yet even more!</p>
  16. HTML
  17.  
  18. my $tb = HTML::TreeBuilder->new_from_content($html);
  19.  
  20. # Each entry maps one CSS declaration (property plus keyword value, written
  21. # in lower case) to the name of the HTML tag that replaces it.
  22. my @replacements = (
  23.     { property => 'font-style', value => 'italic', replacement => 'em' },
  24.     { property => 'font-weight', value => 'bold', replacement => 'strong' },
  25.     { property => 'text-align', value => 'center', replacement => 'center' },
  26.     { property => 'text-decoration', value => 'underline', replacement => 'u' },
  27. );
  28.  
  29. # Only <p> and <span> elements are inspected; widen the pattern (or just
  30. # use sub { 1 }) to cover more tags.
  31. my @nodes = $tb->look_down(sub { $_[0]->tag =~ /^(?:p|span)$/ });
  32.  
  33. for my $el (@nodes) {
  34.     my $style_attr = $el->attr('style');
  35.     next unless $style_attr;
  36.  
  37.     my $st = CSS::DOM::Style::parse($style_attr);
  38.     next unless $st;
  39.  
  40.     for my $h (@replacements) {
  41.         my $value = $st->getPropertyValue($h->{property});
  42.  
  43.         # CSS keywords are case-insensitive, so "Bold" must match "bold".
  44.         next unless defined $value && lc($value) eq $h->{value};
  45.  
  46.         $st->removeProperty($h->{property});
  47.  
  48.         # Move the element's current children under the new wrapper tag.
  49.         # Successive matches on one element therefore nest, e.g.
  50.         # <strong><em>...</em></strong>.
  51.         my $new = HTML::Element->new($h->{replacement});
  52.         $new->push_content($el->detach_content);
  53.         $el->push_content($new);
  54.     }
  55.  
  56.     # Keep the declarations that were not converted; passing undef removes
  57.     # the style attribute when nothing is left.
  58.     my $remaining = $st->cssText;
  59.     $el->attr('style', $remaining ? $remaining : undef);
  60. }
  61.  
  62. # The empty hashref means no end tag is treated as optional, so the closing
  63. # </p> tags are always emitted.
  64. my $src = $tb->as_HTML(undef, "\t", {});
  65. $tb->delete;    # the tree is no longer needed once it has been serialized
  66.  
  67. # Anchor on the tag name so that tags merely starting with the same letters
  68. # (<pre>, <param>, ...) are left alone, and stop at the tag's own '>'.
  69. $src =~ s{<p(?:\s[^>]*)?>}{<p>}g;
  70. $src =~ s{</?(?:div|span)(?:\s[^>]*)?>}{}g;
  71.  
  72. print $src;
clone this paste RAW Paste Data