Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- namespace ZetaUploader.ServerRuntime.Sys
- {
- using System;
- using System.Linq;
- using System.Text.RegularExpressions;
- using ZetaLongPaths;
- /// <summary>
- /// Function to convert HTML source into readable plain text, intended to use in e-mail messages.
- /// </summary>
- /// <remarks>
- /// Changelog:
- ///
- /// - 2024-07-01, Uwe Keim: Doing an HTML-decode at start to avoid e.g. German Umlauts
- /// to be removed.
- /// - 2018-09-23, Uwe Keim: Added ability to keep links visible and not remove them.
- /// - 2018-09-23, Uwe Keim: Added ability to keep HRs as "-------...".
- /// - 2013-05-10, Uwe Keim: Initial release.
- ///
- /// Use it for whatever you want. Some of our tools that use this function include:
- ///
- /// - https://www.zeta-producer.com
- /// - https://www.zeta-uploader.com
- /// - https://www.zeta-test.com
- /// </remarks>
public static class HtmlToText2
{
    /// <summary>
    /// Options shared by all tag-matching expressions: HTML tag names are
    /// case-insensitive, and "." must also match the "\r" markers that
    /// earlier conversion steps insert.
    /// </summary>
    private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    /// <summary>
    /// Replacement for horizontal rules. Wrapped in a paragraph so that the
    /// paragraph handling puts empty lines in front of it.
    /// </summary>
    private const string HorizontalRule =
        "<p>---------------------------------------------------------------</p>";

    /// <summary>
    /// Convert a given HTML source code to readable plain text, intended to use in e-mail messages.
    /// </summary>
    /// <param name="html">The HTML source. May be <c>null</c>, empty or whitespace-only.</param>
    /// <returns>
    /// The plain text with <see cref="Environment.NewLine"/> line breaks and without
    /// leading/trailing whitespace; an empty string if <paramref name="html"/> is
    /// <c>null</c>, empty or whitespace-only.
    /// </returns>
    /// <remarks>
    /// Based on:
    /// http://pastebin.com/NswerNkQ,
    /// http://stackoverflow.com/questions/8419517/convert-html-to-plain-text-while-preserving-p-br-ul-ol,
    /// http://www.codeproject.com/KB/HTML/HTML_to_Plain_Text.aspx and
    /// https://github.com/soundasleep/html2text/blob/master/src/Html2Text.php (for keeping links usable).
    /// </remarks>
    public static string ConvertHtmlToPlainText(string html)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return string.Empty;
        }

        var result = CollapseSourceWhitespace(html);
        result = RemoveInvisibleElements(result);
        result = ConvertStructuralTags(result);
        result = ConvertAnchors(result);

        // Remove remaining tags like images, comments etc. - anything that's enclosed inside < >.
        result = Regex.Replace(result, @"<[^>]*>", string.Empty);

        // Entities are decoded only AFTER the tags are gone. Decoding first would turn
        // "&lt;b&gt;" into "<b>", which the tag removal above would then delete.
        result = DecodeEntities(result);

        result = NormalizeLineBreaksAndTabs(result);

        return result.Trim();
    }

    /// <summary>
    /// Removes the source formatting: tabs are dropped and every run of whitespace
    /// (including line breaks) becomes one space, because browsers render it that way.
    /// </summary>
    private static string CollapseSourceWhitespace(string html)
    {
        var withoutTabs = html.Replace("\t", string.Empty);
        return Regex.Replace(withoutTabs, @"\s+", " ");
    }

    /// <summary>
    /// Removes the head, script and style elements together with their content.
    /// </summary>
    private static string RemoveInvisibleElements(string html)
    {
        var result = html;

        foreach (var tag in new[] { "head", "script", "style" })
        {
            // Lazy ".*?": each element is removed on its own. A greedy match would
            // delete everything between the first opening and the last closing tag,
            // including visible text between two script blocks.
            var pattern = @"<\s*" + tag + @"\b[^>]*>.*?<\s*/\s*" + tag + @"\s*>";
            result = Regex.Replace(result, pattern, string.Empty, TagOptions);
        }

        return result;
    }

    /// <summary>
    /// Replaces layout tags with the control characters that represent them in the
    /// plain text: "\t" for cells, "\r" for line breaks, "\r\r" for paragraphs.
    /// </summary>
    private static string ConvertStructuralTags(string html)
    {
        // Insert tabs in places of <td> tags.
        var result = Regex.Replace(html, @"<\s*td\b[^>]*>", "\t", TagOptions);

        // Insert line breaks in places of <br> and <li> tags. "[^>]*" also accepts
        // the self-closing forms "<br/>" and "<br />" as well as attributes.
        result = Regex.Replace(result, @"<\s*br\b[^>]*>", "\r", TagOptions);
        result = Regex.Replace(result, @"<\s*li\b[^>]*>", "\r- ", TagOptions);

        // Keep HRs as a line of dashes.
        result = Regex.Replace(result, @"<hr\b[^>]*>", HorizontalRule, TagOptions);

        // Insert paragraphs (double line breaks) in places of <div>, <tr>, <p>, <ol>
        // and <ul> tags. The names are matched by prefix, so e.g. <pre> also yields
        // a paragraph break.
        result = Regex.Replace(result, @"<\s*(?:div|tr|p|ol|ul)[^>]*>", "\r\r", TagOptions);

        return result;
    }

    /// <summary>
    /// Replaces anchors with a textual form that keeps the link target visible.
    /// </summary>
    private static string ConvertAnchors(string html)
    {
        // Group 1: quote character, group 2: link target, group 3: link text.
        // "[^>]*" after the closing quote accepts attributes that follow the href.
        return Regex.Replace(
            html,
            @"<a\b[^>]*?\shref\s*=\s*(['""])(.*?)\1[^>]*>(.*?)<\s*/\s*a\s*>",
            FormatAnchor,
            TagOptions);
    }

    /// <summary>
    /// Builds the replacement for one anchor: only the link if the text adds no
    /// information, otherwise "[text](link)".
    /// </summary>
    private static string FormatAnchor(Match anchor)
    {
        var link = anchor.Groups[2].Value;
        var text = anchor.Groups[3].Value;

        // The text may still contain markup (e.g. <b> or <img>); compare what will be visible.
        var visibleText = Regex.Replace(text, @"<[^>]*>", string.Empty).Trim();

        if (visibleText.Length == 0)
        {
            return link;
        }

        if (link.Length == 0)
        {
            return text;
        }

        var isSame =
            SameText(link, visibleText) ||
            SameText(link, "mailto:" + visibleText) ||
            SameText(link, "http://" + visibleText) ||
            SameText(link, "https://" + visibleText);

        return isSame ? link : $"[{text}]({link})";
    }

    /// <summary>
    /// Ordinal, case-insensitive comparison (links are not linguistic text).
    /// </summary>
    private static bool SameText(string left, string right)
    {
        return string.Equals(left, right, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Decodes character entities and replaces some special characters with an
    /// ASCII representation.
    /// </summary>
    private static string DecodeEntities(string text)
    {
        // One pass over well-formed entities only, so that nothing is decoded twice
        // and a literal ampersand in the text is never mistaken for an entity.
        var decoded = Regex.Replace(
            text,
            @"&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]{1,31});",
            DecodeEntity);

        return decoded
            .Replace("\u2022", " * ")   // bullet
            .Replace("\u2039", "<")     // single left-pointing angle quotation mark
            .Replace("\u203A", ">")     // single right-pointing angle quotation mark
            .Replace("\u2122", "(tm)")  // trade mark sign
            .Replace("\u2044", "/")     // fraction slash
            .Replace("\u00A9", "(c)")   // copyright sign
            .Replace("\u00AE", "(r)");  // registered sign
    }

    /// <summary>
    /// Decodes a single entity. Unknown entities are removed, whitespace entities
    /// (e.g. a non-breaking space) become a normal space.
    /// </summary>
    private static string DecodeEntity(Match entity)
    {
        var decoded = System.Net.WebUtility.HtmlDecode(entity.Value);

        // HtmlDecode returns its input unchanged if it does not know the entity.
        if (decoded == entity.Value)
        {
            return string.Empty;
        }

        if (decoded.Length == 1 && char.IsWhiteSpace(decoded[0]))
        {
            return " ";
        }

        return decoded;
    }

    /// <summary>
    /// Cleans up the "\r" and "\t" markers: removes whitespace between them, limits
    /// line breaks to 2 and tabs to 4 in a row, and converts "\r" to
    /// <see cref="Environment.NewLine"/>.
    /// </summary>
    private static string NormalizeLineBreaksAndTabs(string text)
    {
        // Lookarounds do not consume the neighbouring markers, so overlapping
        // sequences like "\r \r \r" are cleaned completely.

        // Remove spaces in between line breaks and tabs.
        var result = Regex.Replace(text, @"(?<=[\r\t]) +(?=[\r\t])", string.Empty);

        // Remove tabs in between line breaks.
        result = Regex.Replace(result, @"(?<=\r)\t+(?=\r)", string.Empty);

        // Replace multiple tabs following a line break with just one tab.
        result = Regex.Replace(result, @"(?<=\r)\t+", "\t");

        // Replace over 2 breaks with 2 and over 4 tabs with 4.
        result = Regex.Replace(result, @"\r{3,}", "\r\r");
        result = Regex.Replace(result, @"\t{5,}", "\t\t\t\t");

        // Remove spaces at the beginning of a line.
        result = Regex.Replace(result, @"(?<=\r) +", string.Empty);

        return result.Replace("\r", Environment.NewLine);
    }
}
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement