Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
''' <summary>
''' Removes a fixed set of HTML opening tags (links, images, selected hidden
''' inputs, spans, paragraphs, lists, divs and embedded objects) from
''' <paramref name="str"/> so the text can be used in title tags and descriptions.
''' </summary>
''' <param name="str">The text to clean. May be Nothing or empty.</param>
''' <returns>
''' The text with every listed tag removed, from its opening bracket through its
''' closing bracket. Nothing or empty input is returned unchanged. Tags that are
''' not in the list (including closing tags such as &lt;/a&gt;) are left in place.
''' </returns>
Public Function CleanText(ByVal str As String) As String
    If String.IsNullOrEmpty(str) Then
        Return str
    End If

    'mini db of tag prefixes to get rid of (matched case-sensitively)
    Dim indexChars() As String = {"<a", "<img", "<input type=""hidden"" name=""tax""", "<input type=""hidden"" name=""handling""", "<span", "<p", "<ul", "<div", "<embed", "<object", "<param"}

    For Each tagPrefix As String In indexChars
        Dim tagStart As Integer = str.IndexOf(tagPrefix, StringComparison.Ordinal)

        Do While tagStart <> -1
            Dim closeIndex As Integer = str.IndexOf(">"c, tagStart)
            'start one past the tag's own "<", otherwise the tag finds itself
            Dim nextOpenIndex As Integer = str.IndexOf("<"c, tagStart + 1)

            Dim removeEnd As Integer 'exclusive end of the span to delete
            If closeIndex <> -1 AndAlso (nextOpenIndex = -1 OrElse closeIndex < nextOpenIndex) Then
                'normal case: the tag is closed before any other tag opens
                removeEnd = closeIndex + 1
            ElseIf nextOpenIndex <> -1 Then
                'right bracket was left off: drop everything up to the next tag
                removeEnd = nextOpenIndex
            Else
                'unterminated tag with nothing after it: drop the rest of the text
                removeEnd = str.Length
            End If

            'removeEnd is always greater than tagStart, so every pass shrinks str
            str = str.Remove(tagStart, removeEnd - tagStart)

            'rescan from the start: a removal can join text into a new match
            tagStart = str.IndexOf(tagPrefix, StringComparison.Ordinal)
        Loop
    Next

    Return str
End Function
/// <summary>
/// Removes every <c>&lt;...&gt;</c> span from <paramref name="text"/>,
/// including spans that contain line breaks.
/// </summary>
/// <param name="text">The markup to strip. May be null or empty.</param>
/// <returns>
/// The text with each span from a <c>&lt;</c> to the next <c>&gt;</c> removed.
/// Null or empty input is returned unchanged.
/// </returns>
public string StripHTMLTags(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return text;
    }

    // [^>]* crosses newlines, so a tag split over several lines is still
    // matched; the previous pattern used '.', which stops at a line break.
    return Regex.Replace(text, @"<[^>]*>", string.Empty);
}
- @"<(?:.|\n)*?>"
- Dim indexNextLeftBracket As Integer = str.IndexOf("<", indexOfInput) + 1
- Dim indexNextLeftBracket As Integer = str.IndexOf("<", indexOfInput+1) + 1
// Built once and reused: a RegexOptions.Compiled pattern is expensive to
// construct, so it must not be rebuilt on every call.
// The "Tag" group is the whole tag; "TagName" is the run of letters and
// slashes right after the opening bracket. The former "S*?" token was
// dropped because the following "[^<]*?" already matches everything it could.
private static readonly Regex _tagRegex = new Regex(
    @"(?<Tag><(?<TagName>[a-z/]+)[^<]*?>)",
    RegexOptions.Compiled |
    RegexOptions.IgnoreCase |
    RegexOptions.Multiline );

/// <summary>
/// Removes HTML tags from <paramref name="html"/>, keeping only the tags
/// that the allow list accepts.
/// </summary>
/// <param name="html">The markup to filter. May be null.</param>
/// <param name="allowList">Tag names to keep; every other tag is removed.</param>
/// <returns>
/// The filtered markup, or null when <paramref name="html"/> is null.
/// Each matched tag is passed to <c>TagMatchEvaluator.Replace</c>, which
/// decides whether it is kept or replaced with an empty string.
/// </returns>
public static string RemoveTags( string html, params string[] allowList )
{
    if( html == null )
    {
        return null;
    }

    TagMatchEvaluator evaluator = new TagMatchEvaluator( allowList );
    return _tagRegex.Replace( html, new MatchEvaluator( evaluator.Replace ) );
}
/// <summary>
/// Decides, for each regex match of an HTML tag, whether the tag is kept
/// (its name is on the allow list) or replaced with an empty string.
/// </summary>
private class TagMatchEvaluator
{
    // Tag names that must survive filtering.
    private readonly ArrayList _allowed;

    /// <summary>Creates an evaluator for the given allow list.</summary>
    /// <param name="allowList">Tag names to keep; null is treated as empty.</param>
    public TagMatchEvaluator( string[] allowList )
    {
        _allowed = allowList == null ? new ArrayList() : new ArrayList( allowList );
    }

    /// <summary>
    /// Returns the matched tag unchanged when its name is allowed,
    /// otherwise an empty string.
    /// </summary>
    /// <param name="match">A match that exposes a "TagName" group.</param>
    public string Replace( Match match )
    {
        string rawName = match.Groups[ "TagName" ].Value;

        // The "TagName" group may carry the slash of a closing ("/a") or
        // self-closing ("br/") tag; compare the bare name too, so that allowing
        // "a" keeps both <a> and </a>. The raw name is still checked, so
        // callers that listed "/a" explicitly keep working.
        string bareName = rawName.Trim( '/' );

        if( IsAllowed( rawName ) || IsAllowed( bareName ) )
        {
            return match.Value;
        }
        return "";
    }

    // True when tagName is on the allow list, ignoring case
    // (HTML tag names are case-insensitive).
    private bool IsAllowed( string tagName )
    {
        foreach( object entry in _allowed )
        {
            if( string.Equals( entry as string, tagName, System.StringComparison.OrdinalIgnoreCase ) )
            {
                return true;
            }
        }
        return false;
    }
}
Add Comment
Please, Sign In to add comment