Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/// <summary>
/// Converts an HTML document or fragment into a plain-text approximation.
/// </summary>
/// <remarks>
/// The conversion is regex based (not a real HTML parser) and works in stages:
/// source whitespace is normalized, the <c>head</c>, <c>script</c> and
/// <c>style</c> elements are dropped together with their content, structural
/// tags are turned into tabs / line breaks, every remaining tag is stripped,
/// a fixed set of character entities is decoded, and finally runs of line
/// breaks and tabs are collapsed.
/// Line breaks in the returned text are single carriage returns (<c>"\r"</c>).
/// </remarks>
/// <param name="source">The HTML markup to convert. May be null or empty.</param>
/// <returns>
/// The extracted text, or an empty string when <paramref name="source"/> is
/// null or empty.
/// </returns>
public static string ConvertHtmlToText(string source) {
    if (string.IsNullOrEmpty(source)) {
        return string.Empty;
    }

    // Line breaks in the markup are rendered as spaces by browsers.
    string result = source.Replace("\r", " ").Replace("\n", " ");
    // Tabs in the markup are only source formatting.
    result = result.Replace("\t", string.Empty);
    // Browsers collapse repeated spaces into one.
    result = HtmlToTextReplace(result, @"( )+", " ");

    // Drop elements whose content must never reach the output.
    result = HtmlToTextRemoveElement(result, "head");
    result = HtmlToTextRemoveElement(result, "script");
    result = HtmlToTextRemoveElement(result, "style");

    // The \b after each tag name keeps e.g. "p" from matching <pre>,
    // "tr" from matching <track> and "li" from matching <link>.

    // Table cells become tab stops.
    result = HtmlToTextReplace(result, @"<( )*td\b[^>]*>", "\t");

    // <br> (including the self-closing forms <br/> and <br />) and <li>
    // start a new line.
    result = HtmlToTextReplace(result, @"<( )*br( )*/?( )*>", "\r");
    result = HtmlToTextReplace(result, @"<( )*li\b[^>]*>", "\r");

    // <div>, <tr> and <p> start a new paragraph (double line break).
    result = HtmlToTextReplace(result, @"<( )*div\b[^>]*>", "\r\r");
    result = HtmlToTextReplace(result, @"<( )*tr\b[^>]*>", "\r\r");
    result = HtmlToTextReplace(result, @"<( )*p\b[^>]*>", "\r\r");

    // Strip everything else that is enclosed in < > (links, images,
    // closing tags, comments, ...).
    result = HtmlToTextReplace(result, @"<[^>]*>", string.Empty);

    // Decode the supported character entities. This happens after tag
    // stripping so that a decoded "<" or ">" is never mistaken for markup.
    result = HtmlToTextReplace(result, @"&nbsp;", " ");
    result = HtmlToTextReplace(result, @"&bull;", " * ");
    result = HtmlToTextReplace(result, @"&lsaquo;", "<");
    result = HtmlToTextReplace(result, @"&rsaquo;", ">");
    result = HtmlToTextReplace(result, @"&trade;", "(tm)");
    result = HtmlToTextReplace(result, @"&frasl;", "/");
    result = HtmlToTextReplace(result, @"&lt;", "<");
    result = HtmlToTextReplace(result, @"&gt;", ">");
    result = HtmlToTextReplace(result, @"&copy;", "(c)");
    result = HtmlToTextReplace(result, @"&reg;", "(r)");
    result = HtmlToTextReplace(result, @"&quot;", "\"");

    // Remove every other entity. Only entity-shaped tokens are matched
    // (no spaces allowed), so ordinary text such as "R&D dept;" survives.
    // "&amp;" is excluded here and decoded last, so that an escaped
    // entity like "&amp;lt;" yields the literal text "&lt;".
    result = HtmlToTextReplace(result, @"&(?!amp;)#?[a-z0-9]{2,8};", string.Empty);
    result = HtmlToTextReplace(result, @"&amp;", "&");

    // Make line breaking consistent.
    result = result.Replace("\n", "\r");

    // Remove spaces that sit between line breaks and/or tabs so the
    // collapsing below sees contiguous runs.
    result = HtmlToTextReplace(result, "(\r)( )+(\r)", "\r\r");
    result = HtmlToTextReplace(result, "(\t)( )+(\t)", "\t\t");
    result = HtmlToTextReplace(result, "(\t)( )+(\r)", "\t\r");
    result = HtmlToTextReplace(result, "(\r)( )+(\t)", "\r\t");
    // Tabs alone between two line breaks carry no information.
    result = HtmlToTextReplace(result, "(\r)(\t)+(\r)", "\r\r");
    // Several tabs directly after a line break are reduced to one.
    result = HtmlToTextReplace(result, "(\r)(\t)+", "\r\t");

    // Cap runs at two line breaks and four tabs. A single quantified
    // pattern handles a run of any length in one pass.
    result = HtmlToTextReplace(result, "\r{3,}", "\r\r");
    result = HtmlToTextReplace(result, "\t{5,}", "\t\t\t\t");

    return result;
}

/// <summary>
/// Case-insensitive regular-expression replace used by <c>ConvertHtmlToText</c>.
/// </summary>
private static string HtmlToTextReplace(string input, string pattern, string replacement) {
    return System.Text.RegularExpressions.Regex.Replace(
        input,
        pattern,
        replacement,
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
}

/// <summary>
/// Removes every <paramref name="tagName"/> element together with its content.
/// The lazy quantifier stops at the nearest closing tag, so text located
/// between two separate elements of the same kind is preserved.
/// </summary>
private static string HtmlToTextRemoveElement(string html, string tagName) {
    string pattern =
        @"<( )*" + tagName + @"\b[^>]*>.*?<( )*/( )*" + tagName + @"( )*>";
    return System.Text.RegularExpressions.Regex.Replace(
        html,
        pattern,
        string.Empty,
        System.Text.RegularExpressions.RegexOptions.IgnoreCase
            | System.Text.RegularExpressions.RegexOptions.Singleline);
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement