// Convert HTML to plain text

namespace ZetaUploader.ServerRuntime.Sys
{
    using System;
    using System.Linq;
    using System.Text.RegularExpressions;
    using ZetaLongPaths;

    /// <summary>
    /// Function to convert HTML source into readable plain text, intended to use in e-mail messages.
    /// </summary>
    /// <remarks>
    /// Changelog:
    ///
    /// - 2024-07-01, Uwe Keim: Doing an HTML-decode at start to avoid e.g. German Umlauts
    ///                         to be removed.
    /// - 2018-09-23, Uwe Keim: Added ability to keep links visible and not remove them.
    /// - 2018-09-23, Uwe Keim: Added ability to keep HRs as "-------...".
    /// - 2013-05-10, Uwe Keim: Initial release.
    ///
    /// Use it for whatever you want. Some of our tools that use this function include:
    ///
    /// - https://www.zeta-producer.com
    /// - https://www.zeta-uploader.com
    /// - https://www.zeta-test.com
    /// </remarks>
    public static class HtmlToText2
    {
        /// <summary>
        /// Convert a given HTML source code to readable plain text, intented to use in e-mail message.
        /// </summary>
        public static string ConvertHtmlToPlainText(string html)
        {
            if (string.IsNullOrWhiteSpace(html)) return string.Empty;

            // 2024-07-01, Uwe Keim: Umlaute werden geschluckt, deshalb in die literalen Zeichen umwandeln.
            html = System.Net.WebUtility.HtmlDecode(html);

            // http://pastebin.com/NswerNkQ
            // http://stackoverflow.com/questions/8419517/convert-html-to-plain-text-while-preserving-p-br-ul-ol
            // http://www.codeproject.com/KB/HTML/HTML_to_Plain_Text.aspx
            // https://github.com/soundasleep/html2text/blob/master/src/Html2Text.php for keeping links usable.

            // Remove HTML Development formatting
            // Replace line breaks with space
            // because browsers inserts space
            // ReSharper disable LocalizableElement
            var result = html.Replace("\r", @" ");
            // Replace line breaks with space
            // because browsers inserts space
            result = result.Replace("\n", @" ");
            // Remove step-formatting
            result = result.Replace("\t", string.Empty);
            // ReSharper restore LocalizableElement
            // Remove repeating spaces because browsers ignore them
            result = Regex.Replace(result, @"(\s)+", " ", RegexOptions.Singleline);

            // Remove the header (prepare first by clearing attributes)
            result = Regex.Replace(result,
                        @"<( )*head([^>])*>", @"<head>",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"(<( )*(/)( )*head( )*>)", @"</head>",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"(<head>).*(</head>)", string.Empty,
                        RegexOptions.IgnoreCase);

            // remove all scripts (prepare first by clearing attributes)
            result = Regex.Replace(result,
                        @"<( )*script([^>])*>", @"<script>",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"(<( )*(/)( )*script( )*>)", @"</script>",
                        RegexOptions.IgnoreCase);
            //result = Regex.Replace(result,
            //         @"(<script>)([^(<script>\.</script>)])*(</script>)",
            //         string.Empty,
            //         RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"(<script>).*(</script>)", string.Empty,
                        RegexOptions.IgnoreCase);

            // remove all styles (prepare first by clearing attributes)
            result = Regex.Replace(result,
                        @"<( )*style([^>])*>", @"<style>",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"(<( )*(/)( )*style( )*>)", @"</style>",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"(<style>).*(</style>)", string.Empty,
                        RegexOptions.IgnoreCase);

            // insert tabs in spaces of <td> tags
            result = Regex.Replace(result,
                        @"<( )*td([^>])*>", "\t",
                        RegexOptions.IgnoreCase);

            // insert line breaks in places of <BR> and <LI> tags
            result = Regex.Replace(result,
                        @"<( )*br( )*>", "\r",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"<( )*li( )*>", "\r- ",
                        RegexOptions.IgnoreCase);

            // --
            // Keep HRs.

            var hrs = Regex.Matches(
                    result, @"<hr.*?>",
                    RegexOptions.IgnoreCase)
                .Cast<Match>()
                .Where(m => m.Success);

            foreach (var hr in hrs)
            {
                // Insert inside "<p>" tags to let the following code add
                // the correct number of empty lines.
                result = result.Replace(hr.Value, "<p>---------------------------------------------------------------</p>");
            }

            // --

            // insert line paragraphs (double line breaks) in place
            // if <P>, <DIV> and <TR> tags
            result = Regex.Replace(result,
                        @"<( )*div([^>])*>", "\r\r",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"<( )*tr([^>])*>", "\r\r",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"<( )*p([^>])*>", "\r\r",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"<( )*ol([^>])*>", "\r\r",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"<( )*ul([^>])*>", "\r\r",
                        RegexOptions.IgnoreCase);

            // --
            // Extract from Anchors.

            var anchors = Regex.Matches(
                    result,
                    @"<a.*?href\s*=\s*(?:['""])(.+?)(?:['""])>(.*?)</a>",
                    RegexOptions.IgnoreCase | RegexOptions.Singleline)
                .Cast<Match>()
                .Where(m => m.Success);

            foreach (var anchor in anchors)
            {
                var link = anchor.Groups[1].Value;
                var text = anchor.Groups[2].Value;

                var coreLink = link;
                var coreText = text;

                var isSame =
                    coreLink.EqualsNoCase(coreText) ||
                    coreLink.EqualsNoCase($@"mailto:{coreText}") ||
                    coreLink.EqualsNoCase($@"http://{coreText}") ||
                    coreLink.EqualsNoCase($@"https://{coreText}");

                var replacement = isSame || string.IsNullOrEmpty(coreText) ? coreLink : $@"[{coreText}]({coreLink})";

                result = result.Replace(anchor.Value, replacement);
            }

            // --

            // Remove remaining tags like <a>, links, images,
            // comments etc - anything that's enclosed inside < >
            result = Regex.Replace(result,
                    @"<[^>]*>", string.Empty,
                    RegexOptions.IgnoreCase);

            // replace special characters:
            result = Regex.Replace(result,
                        @" ", " ",
                        RegexOptions.IgnoreCase);

            result = Regex.Replace(result,
                        @"&bull;", " * ",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"&lsaquo;", "<",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"&rsaquo;", ">",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"&trade;", "(tm)",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"&frasl;", "/",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"&lt;", "<",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"&gt;", ">",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"&copy;", "(c)",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        @"&reg;", "(r)",
                        RegexOptions.IgnoreCase);
            // Remove all others. More can be added, see
            // http://hotwired.lycos.com/webmonkey/reference/special_characters/
            result = Regex.Replace(result,
                        @"&(.{2,6});", string.Empty,
                        RegexOptions.IgnoreCase);

            // for testing
            //Regex.Replace(result,
            //       this.txtRegex.Text,string.Empty,
            //       RegexOptions.IgnoreCase);

            // make line breaking consistent
            // ReSharper disable LocalizableElement
            result = result.Replace("\n", "\r");
            // ReSharper restore LocalizableElement

            // Remove extra line breaks and tabs:
            // replace over 2 breaks with 2 and over 4 tabs with 4.
            // Prepare first to remove any whitespaces in between
            // the escaped characters and remove redundant tabs in between line breaks
            result = Regex.Replace(result,
                        "(\r)( )+(\r)", "\r\r",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        "(\t)( )+(\t)", "\t\t",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        "(\t)( )+(\r)", "\t\r",
                        RegexOptions.IgnoreCase);
            result = Regex.Replace(result,
                        "(\r)( )+(\t)", "\r\t",
                        RegexOptions.IgnoreCase);
            // Remove redundant tabs
            result = Regex.Replace(result,
                        "(\r)(\t)+(\r)", "\r\r",
                        RegexOptions.IgnoreCase);
            // Remove multiple tabs following a line break with just one tab
            result = Regex.Replace(result,
                        "(\r)(\t)+", "\r\t",
                        RegexOptions.IgnoreCase);
            // Initial replacement target string for line breaks
            // ReSharper disable LocalizableElement
            var breaks = "\r\r\r";
            // ReSharper restore LocalizableElement
            // Initial replacement target string for tabs
            // ReSharper disable LocalizableElement
            var tabs = "\t\t\t\t\t";
            // ReSharper restore LocalizableElement
            for (var index = 0; index < result.Length; index++)
            {
                // ReSharper disable LocalizableElement
                result = result.Replace(breaks, "\r\r");
                result = result.Replace(tabs, "\t\t\t\t");
                breaks = breaks + "\r";
                tabs = tabs + "\t";
                // ReSharper restore LocalizableElement
            }

            // UK: Space at the beginning.
            // ReSharper disable LocalizableElement
            result = result.Replace("\r ", "\r");
            // ReSharper restore LocalizableElement

            // UK: Normalize.
            // ReSharper disable LocalizableElement
            result = result.Replace("\r", Environment.NewLine);
            // ReSharper restore LocalizableElement

            // That's it.
            return result.Trim();
        }
    }
}