Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- using System;
- using System.Collections.Generic;
- using System.Diagnostics;
- using System.Text;
- using System.Text.RegularExpressions;
- namespace QueryParser
- {
- /// <summary>
- /// This class is used to parse a user entered string into a query
- /// for use in a SQL statement with the CONTAINS clause for full-text searching
- /// </summary>
- /// <remarks>
- /// This query parser is based heavily on code originally written by Robert Dominy (http://dev.angusog.com/).
- /// It was originally in server-side JScript, and converted to C# for use in ASP.NET by Munsifali Rashid.
- /// </remarks>
public class UserQueryParser
{
    #region Constructor

    /// <summary>
    /// Registers the fixed operator and parenthesis tokens the parser
    /// recognises. Keys are the lower-case textual forms matched against
    /// the (lower-cased) user query.
    /// </summary>
    static UserQueryParser()
    {
        m_BuiltInTokens["and"] = new Token(TokenType.AndOperator, "AND");
        m_BuiltInTokens["or"] = new Token(TokenType.OrOperator, "OR");
        m_BuiltInTokens["near"] = new Token(TokenType.NearOperator, "NEAR");
        m_BuiltInTokens["not"] = new Token(TokenType.NotOperator, "NOT");
        m_BuiltInTokens["("] = new Token(TokenType.LeftParenthis, "(");
        m_BuiltInTokens[")"] = new Token(TokenType.RightParenthis, ")");
    }

    #endregion

    #region Private variables

    // Message describing the last parse failure; empty after a successful parse.
    private string m_Error = string.Empty;

    // Token stream produced by the most recent call to ParseTokens().
    private readonly List<Token> m_Tokens = new List<Token>();

    // Words ignored during parsing. NOTE(review): this list is shared by all
    // instances and is not synchronised - populate it once at start-up,
    // before any parsing happens.
    private static readonly List<String> m_NoiseWords = new List<string>();

    // Built-in operator/parenthesis tokens, keyed by their lower-case text.
    private static readonly Dictionary<String, Token> m_BuiltInTokens = new Dictionary<string, Token>();

    // Splits a query into, in order of preference:
    //   1. quoted phrases, optionally negated with a leading '-'   (-"big dog")
    //   2. words, optionally negated or with a trailing wildcard   (-cat, cat*)
    //   3. single parentheses                                      ( and )
    // FIX: the previous pattern kept the "/.../" delimiters from the original
    // JScript source as literal characters, which made the parenthesis
    // alternative unmatchable - '(' and ')' were swallowed into word matches
    // and never became parenthesis tokens - and tried the word alternatives
    // before the quoted-phrase one. The regex is now also built once and
    // compiled, instead of being recreated on every ParseTokens() call.
    private static readonly Regex m_TokenRegex = new Regex(
        @"(-?""[^""]*"")|([A-Za-z0-9'_\u00C0-\u00FF-]+\*?)|([\(\)])",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    #endregion

    #region Accessors

    /// <summary>
    /// Gets the error message from the last call to <see cref="ParseTokens"/>,
    /// or an empty string when the query parsed successfully.
    /// </summary>
    public string Error
    {
        get
        {
            return m_Error;
        }
    }

    /// <summary>
    /// Gets the shared, mutable list of noise words skipped during parsing.
    /// </summary>
    public static List<string> NoiseWords
    {
        get
        {
            return m_NoiseWords;
        }
    }

    #endregion

    /// <summary>
    /// Gets the parsed tokens as a string ready for use in a SQL query with the
    /// CONTAINS clause. Call <see cref="ParseTokens"/> first.
    /// </summary>
    /// <returns>The space-separated token list, e.g. "cats" AND "dogs".</returns>
    public string GetSqlQuery()
    {
        StringBuilder sb = new StringBuilder();
        foreach (Token token in m_Tokens)
        {
            string tokenValue = token.Value;

            // Escape single quotes so the value is safe inside a SQL string literal.
            tokenValue = tokenValue.Replace("'", "''");

            // Wrap user terms in double quotes unless they already are quoted
            // (i.e. the token is a phrase search).
            if (token.TokenType == TokenType.UserItem && !tokenValue.StartsWith("\"") && !tokenValue.EndsWith("\""))
                tokenValue = string.Format("\"{0}\"", tokenValue);

            Debug.WriteLine(" - " + tokenValue);

            // Append the token value to the list.
            sb.Append(tokenValue);
            sb.Append(" ");
        }
        return sb.ToString().Trim();
    }

    /// <summary>
    /// Parses the query and initialises the token list.
    /// </summary>
    /// <param name="userQuery">The user query.</param>
    /// <returns>[True] if the query is valid, otherwise [False] (see <see cref="Error"/>).</returns>
    public bool ParseTokens(string userQuery)
    {
        // FIX: clear any stale message from a previous failed parse so that
        // Error is empty after a successful parse.
        m_Error = string.Empty;

        // Phrases must be properly delimited: an even number of quote marks.
        if (CountQuotes(userQuery) % 2 != 0)
        {
            m_Error = "Invalid number of quote marks";
            return false;
        }

        // Normalise case so the lower-case built-in token lookup works.
        // Invariant casing avoids culture surprises (e.g. Turkish 'I').
        userQuery = userQuery.ToLowerInvariant();

        // A query cannot start with a NOT operator, so strip a leading '-'.
        // FIX: the Substring result was previously discarded, making this a no-op.
        if (userQuery.StartsWith("-"))
            userQuery = userQuery.Substring(1);

        // Parse the query into raw matches.
        MatchCollection matches = m_TokenRegex.Matches(userQuery);

        // Nothing parsed yet, so no tokens.
        m_Tokens.Clear();
        Token lastParsedToken = null;

        // Turn the regex matches into words/phrases, then into tokens.
        List<string> words = ParseMatchesIntoWordList(matches);
        foreach (string word in words)
        {
            Token token = GetToken(word);
            if (lastParsedToken != null && (lastParsedToken.TokenType == TokenType.NoiseWord) && (token.TokenType & TokenType.BinaryOperator) == token.TokenType)
            {
                // Skip this token since it joins a noise word.
            }
            else if (lastParsedToken == null && (token.TokenType & TokenType.Operator) == token.TokenType)
            {
                // Skip this, as a query cannot start with an operator.
            }
            else if (token.TokenType == TokenType.NoiseWord)
            {
                // Drop trailing operators that would now dangle before the
                // noise word (e.g. "cats and the" -> "cats").
                UnrollExpression(TokenType.Operator);
                lastParsedToken = token;
            }
            else
            {
                // Get the last (previous) token.
                Token lastToken = GetLastToken();
                if (token.TokenType == TokenType.UserItem)
                {
                    // Two adjacent expressions are implicitly joined with AND.
                    if (lastToken != null && (lastToken.TokenType & TokenType.Expression) == lastToken.TokenType)
                    {
                        m_Tokens.Add(m_BuiltInTokens["and"]);
                    }
                }
                else if (((token.TokenType & TokenType.NotOperator) == token.TokenType) && lastToken != null && (lastToken.TokenType & TokenType.Expression) == lastToken.TokenType)
                {
                    // Likewise, NOT directly after an expression needs a joining
                    // AND to keep the CONTAINS syntax correct.
                    m_Tokens.Add(m_BuiltInTokens["and"]);
                }

                // Add the token to the list and remember it.
                m_Tokens.Add(token);
                lastParsedToken = token;
            }
        }
        return IsValid();
    }

    #region Private Helper Stuff

    /// <summary>
    /// Validates the token stream as a whole: each token must be legal after
    /// its predecessor, parentheses must balance, and the query must not end
    /// on an operator or unclosed group.
    /// </summary>
    private bool IsValid()
    {
        if (m_Tokens.Count == 0)
        {
            m_Error = "Search string is empty";
            return false;
        }

        bool valid = true;
        bool lastItemOK = false;
        // Token types that may legally start the query.
        TokenType nextItem = TokenType.UserItem | TokenType.LeftParenthis | TokenType.NotOperator;
        // Running parenthesis balance: +1 for '(', -1 for ')'.
        int balance = 0;

        for (int tokIndex = 0; tokIndex < m_Tokens.Count; tokIndex++)
        {
            Token token = m_Tokens[tokIndex];
            if ((token.TokenType & nextItem) != 0)
            {
                // Each case records which token types may legally follow, and
                // whether the query could validly end on this token.
                switch (token.TokenType)
                {
                    case (TokenType.UserItem):
                        nextItem = TokenType.BinaryOperator | TokenType.RightParenthis;
                        lastItemOK = true;
                        break;
                    case (TokenType.AndOperator):
                        nextItem = TokenType.UserItem | TokenType.NotOperator | TokenType.LeftParenthis;
                        lastItemOK = false;
                        break;
                    case (TokenType.NearOperator):
                        nextItem = TokenType.UserItem;
                        lastItemOK = false;
                        break;
                    case (TokenType.OrOperator):
                        nextItem = TokenType.UserItem | TokenType.LeftParenthis;
                        lastItemOK = false;
                        break;
                    case (TokenType.NotOperator):
                        nextItem = TokenType.UserItem | TokenType.LeftParenthis;
                        lastItemOK = false;
                        break;
                    case (TokenType.LeftParenthis):
                        balance++;
                        nextItem = TokenType.UserItem;
                        lastItemOK = false;
                        break;
                    case (TokenType.RightParenthis):
                        balance--;
                        nextItem = TokenType.OrOperator | TokenType.AndOperator;
                        lastItemOK = (balance <= 0);
                        break;
                }
                if (balance < 0)
                {
                    valid = false;
                    m_Error = "Mismatched parenthesis";
                    break;
                }
            }
            else
            {
                valid = false;
                m_Error = "Unexpected word or character found: " + m_Tokens[tokIndex].Value;
                break;
            }
        }

        if (balance != 0)
        {
            valid = false;
            m_Error = "Mismatched parenthesis";
        }
        else if (valid && !lastItemOK)
        {
            valid = false;
            m_Error = "Unexpected end of search string after: " + m_Tokens[m_Tokens.Count - 1].Value;
        }
        return valid;
    }

    // Flags so combinations (Operator, BinaryOperator, Expression) can be
    // tested with a single bitwise AND.
    [Flags]
    private enum TokenType
    {
        UserItem = 1,
        AndOperator = 2,
        OrOperator = 4,
        NotOperator = 8,
        LeftParenthis = 16,
        RightParenthis = 32,
        NearOperator = 64,
        NoiseWord = 128,
        Operator = AndOperator | OrOperator | NotOperator | NearOperator,
        BinaryOperator = AndOperator | OrOperator | NearOperator,
        Expression = RightParenthis | UserItem
    }

    /// <summary>
    /// Gets the token for the specified word: a built-in operator/parenthesis,
    /// a noise word, or an ordinary user search term.
    /// </summary>
    private static Token GetToken(string text)
    {
        // Single lookup instead of ContainsKey + indexer.
        Token builtIn;
        if (m_BuiltInTokens.TryGetValue(text, out builtIn))
            return builtIn;

        Token token = new Token();
        token.Value = text;
        token.TokenType = m_NoiseWords.Contains(text) ? TokenType.NoiseWord : TokenType.UserItem;
        return token;
    }

    /// <summary>
    /// Gets the last token in the list. If there is no last token, null is returned.
    /// </summary>
    private Token GetLastToken()
    {
        if (m_Tokens.Count > 0)
            return m_Tokens[m_Tokens.Count - 1];
        return null;
    }

    /// <summary>
    /// Removes tokens from the end of the list while they match the specified
    /// type mask, stopping at the first token that does not.
    /// </summary>
    private void UnrollExpression(TokenType type)
    {
        for (int i = m_Tokens.Count; i > 0; i--)
        {
            Token tok = m_Tokens[i - 1];
            if ((tok.TokenType & type) != 0)
            {
                // FIX: Remove(tok) removed the FIRST occurrence of the token
                // instance. Built-in tokens (AND/OR/...) are shared instances,
                // so a repeated operator caused the wrong element to be removed.
                m_Tokens.RemoveAt(i - 1);
            }
            else
            {
                break;
            }
        }
    }

    /// <summary>
    /// Counts how many times the quote (") character appears in the specified string.
    /// </summary>
    private static int CountQuotes(IEnumerable<char> s)
    {
        int count = 0;
        foreach (char c in s)
        {
            if (c == '"')
                count++;
        }
        return count;
    }

    /// <summary>
    /// Converts the regex matches into the list of words/phrases to tokenize,
    /// expanding a leading '-' into a separate "not" word.
    /// </summary>
    private static List<string> ParseMatchesIntoWordList(MatchCollection matches)
    {
        // This will contain our list of raw words.
        List<String> wordList1 = new List<string>();

        // If a quoted phrase is ever split across several matches, the partial
        // phrase is accumulated here until the closing quote is seen, then
        // added as a single word. (With the current regex a quoted phrase is
        // normally one match, so this is retained as a safety net.)
        string currentWord = string.Empty;
        foreach (Match match in matches)
        {
            if (currentWord == string.Empty)
            {
                // This is a new word, so set our current word.
                currentWord = match.Value;

                // A word starting with a quote (or -quote) but not ending with
                // one is an unfinished phrase: keep accumulating.
                if ((currentWord.StartsWith("-\"") || currentWord.StartsWith("\"")) && !currentWord.EndsWith("\""))
                {
                    continue;
                }
            }
            else
            {
                // Continue the open phrase; stop only once the closing quote arrives.
                currentWord += " " + match.Value;
                if (!currentWord.EndsWith("\""))
                {
                    continue;
                }
            }

            // All done. Add our word or phrase to the list.
            wordList1.Add(currentWord);
            // Clear the word, so that we can work on the next one in the list.
            currentWord = string.Empty;
        }

        // Final list of words, with '-' negation expanded.
        List<String> wordList2 = new List<string>();
        foreach (string w in wordList1)
        {
            string word = w;

            // For words (or phrases) starting with a hyphen, remove it and
            // insert 'not' before the word in the token list.
            if (word.StartsWith("-"))
            {
                word = word.Substring(1);
                wordList2.Add("not");
            }
            wordList2.Add(word);
        }
        return wordList2;
    }

    // A single parsed token: its classification plus its output text.
    private class Token
    {
        public TokenType TokenType;
        public string Value;

        public Token()
        {
        }

        public Token(TokenType tokenType, string value)
        {
            TokenType = tokenType;
            Value = value;
        }

        public override string ToString()
        {
            return Value;
        }
    }

    #endregion
}
- }
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement