andriesss
By: a guest | Feb 3rd, 2010 | Syntax:
PHP | Size: 1.55 KB | Hits: 307 | Expires: Never
<?php
// @author Andries Seutens <andries@sanmax.be>
// Print the plain-text rendering wrapped in a <pre> element so the line
// breaks produced by htmlToText() stay visible in a browser.
// NOTE(review): $html is not assigned anywhere in the visible part of this
// file -- confirm it is defined before this statement runs.
echo '<pre>', htmlToText($html), '</pre>';
/**
 * Convert an HTML fragment into a readable plain-text approximation.
 *
 * Processing order (each step works on the output of the previous one):
 *  1. Windows/old-Mac line endings are normalised to "\n".
 *  2. <script> elements are removed together with their contents.
 *  3. Every tag except img, p, li, ul, ol, h1, h2 and h3 is stripped.
 *  4. <img> tags carrying a double-quoted alt attribute are replaced by that alt text.
 *  5. List items become "\n- item"; runs of three or more whitespace characters
 *     become a single newline; headings get a leading newline; lists and
 *     paragraphs get a trailing newline. Inside each formatted element, runs of
 *     two or more whitespace characters are collapsed to one space.
 *  6. Any tags still left are stripped.
 *
 * HTML entities are not decoded and the result is not trimmed.
 *
 * @param string $html HTML markup to convert (cast to string before processing).
 *
 * @return string The plain-text rendering.
 */
function htmlToText($html)
{
    // normalize newlines (windows vs unix vs old mac)
    $html = str_replace(array("\r\n", "\r"), "\n", (string) $html);

    // remove javascript including its body; case-insensitive so <SCRIPT> is caught too
    $html = preg_replace('%<script[^>]*>.*?</script>%is', '', $html);

    // strip html tags, except for the ones we want to format
    $html = strip_tags($html, '<img><p><li><ul><ol><h1><h2><h3>');

    // replace pictures with their alt text; the match is confined to a single
    // tag ([^>]*) so it can never swallow markup between two images
    $html = preg_replace('%<img\b[^>]*\salt="([^"]*)"[^>]*>%i', '$1', $html);

    // Wraps the inner text of every element matched by $pattern in
    // $prefix/$suffix after collapsing internal whitespace runs. A callback is
    // used instead of the removed /e modifier, which also mangled double
    // quotes in the captured text.
    $reformat = function ($pattern, $prefix, $suffix, $subject) {
        return preg_replace_callback(
            $pattern,
            function ($match) use ($prefix, $suffix) {
                return $prefix . preg_replace('%\s{2,}%', ' ', $match[1]) . $suffix;
            },
            $subject
        );
    };

    // properly format list items
    $html = $reformat('%<li[^>]*>(.*?)</li>%is', "\n- ", '', $html);

    // replace multiple empty lines & spaces with a single newline
    $html = preg_replace('%\s{3,}%', "\n", $html);

    // properly format heading tags
    $html = $reformat('%<h[1-9][^>]*>(.*?)</h[1-9]>%is', "\n", '', $html);

    // properly format lists
    $html = $reformat('%<(?:u|o)l[^>]*>(.*?)</(?:u|o)l>%is', '', "\n", $html);

    // properly format paragraphs
    $html = $reformat('%<p[^>]*>(.*?)</p>%is', '', "\n", $html);

    // strip any leftover tags
    return strip_tags($html);
}