using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
using System.Text.RegularExpressions;

namespace QueryParser
{
    /// <summary>
    /// Parses a user-entered search string into a query for use in a SQL
    /// statement with the CONTAINS clause for full-text searching.
    /// </summary>
    /// <remarks>
    /// This query parser is based heavily on code originally written by
    /// Robert Dominy (http://dev.angusog.com/). It was originally in
    /// server-side JScript, and converted to C# for use in ASP.NET by
    /// Munsifali Rashid.
    /// </remarks>
    public class UserQueryParser
    {
        #region Constructor

        static UserQueryParser()
        {
            // Reserved words/characters that map to fixed operator tokens.
            // These instances are shared, so token removal must be by index
            // (see UnrollExpression), never by reference.
            m_BuiltInTokens["and"] = new Token(TokenType.AndOperator, "AND");
            m_BuiltInTokens["or"] = new Token(TokenType.OrOperator, "OR");
            m_BuiltInTokens["near"] = new Token(TokenType.NearOperator, "NEAR");
            m_BuiltInTokens["not"] = new Token(TokenType.NotOperator, "NOT");
            m_BuiltInTokens["("] = new Token(TokenType.LeftParenthis, "(");
            m_BuiltInTokens[")"] = new Token(TokenType.RightParenthis, ")");
        }

        #endregion

        #region Private variables

        private string m_Error = string.Empty;
        private readonly List<Token> m_Tokens = new List<Token>();
        private static readonly List<string> m_NoiseWords = new List<string>();
        private static readonly Dictionary<string, Token> m_BuiltInTokens = new Dictionary<string, Token>();

        #endregion

        #region Accessors

        /// <summary>
        /// Gets the error message produced by the last call to <see cref="ParseTokens"/>,
        /// or an empty string if no error has occurred.
        /// </summary>
        public string Error
        {
            get { return m_Error; }
        }

        /// <summary>
        /// Gets the (shared, mutable) list of noise words that are filtered out of queries.
        /// Words should be added in lower case, as queries are lower-cased before matching.
        /// </summary>
        public static List<string> NoiseWords
        {
            get { return m_NoiseWords; }
        }

        #endregion

        /// <summary>
        /// Gets the tokens in a string ready for use in a SQL query with the CONTAINS clause.
        /// Call <see cref="ParseTokens"/> first; the result is only meaningful after a
        /// successful parse.
        /// </summary>
        public string GetSqlQuery()
        {
            StringBuilder sb = new StringBuilder();

            foreach (Token token in m_Tokens)
            {
                // Get the token value
                string tokenValue = token.Value;

                // Make it SQL safe (escape embedded single quotes)
                tokenValue = tokenValue.Replace("'", "''");

                // Wrap the token value in quotes, if it's not in quotes already
                // (ie. might be a phrase search)
                if (token.TokenType == TokenType.UserItem &&
                    !tokenValue.StartsWith("\"") &&
                    !tokenValue.EndsWith("\""))
                {
                    tokenValue = string.Format("\"{0}\"", tokenValue);
                }

                Debug.WriteLine(" - " + tokenValue);

                // Append the token value to the list
                sb.Append(tokenValue);
                sb.Append(" ");
            }

            return sb.ToString().Trim();
        }

        /// <summary>
        /// Parses the query and initialises the tokens.
        /// </summary>
        /// <param name="userQuery">The user query</param>
        /// <returns>[True] if query is valid, otherwise [False]</returns>
        public bool ParseTokens(string userQuery)
        {
            // First make sure that we've got an even number of quotes
            if (CountQuotes(userQuery) % 2 != 0)
            {
                m_Error = "Invalid number of quote marks";
                return false;
            }

            // Clean up query. Invariant lower-casing so the operator-keyword
            // lookup ("and", "or", ...) is culture-independent.
            userQuery = userQuery.ToLowerInvariant();

            // Query cannot start with a not operator, so remove it if applicable.
            // (Bug fix: the original discarded the Substring() result, so the
            // leading '-' was never actually removed.)
            if (userQuery.StartsWith("-"))
                userQuery = userQuery.Substring(1);

            // Parse the query into tokens. The original pattern still carried the
            // '/.../' delimiters from the JScript source as literal characters, and
            // a garbled '(-^' character range that swallowed parentheses into words.
            // Rebuilt as three alternatives (order matters - the phrase alternative
            // must come first so -"phrase" is matched as a unit):
            //   1. an optionally '-'-prefixed quoted phrase,
            //   2. a word (letters, digits, apostrophe, hyphen, underscore,
            //      Latin-1 accented chars) with an optional trailing '*' wildcard,
            //   3. a single parenthesis.
            const string pattern = @"(-{0,1}""[^""]*"")|([A-Za-z0-9'\-_\u00C0-\u00FF]+\*{0,1})|([\(\)])";
            Regex re = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);
            MatchCollection matches = re.Matches(userQuery);

            // Nothing parsed yet so no token
            m_Tokens.Clear();
            Token lastParsedToken = null;

            // Parse the regex matches into a list of words
            // which we'll then turn into a list of tokens.
            List<string> words = ParseMatchesIntoWordList(matches);

            // Iterate through all the matches in the query
            foreach (string word in words)
            {
                // Get the token from the word
                Token token = GetToken(word);

                if (lastParsedToken != null &&
                    (lastParsedToken.TokenType == TokenType.NoiseWord) &&
                    (token.TokenType & TokenType.BinaryOperator) == token.TokenType)
                {
                    // Skip this token since it joins a noise word
                }
                else if (lastParsedToken == null &&
                         (token.TokenType & TokenType.Operator) == token.TokenType)
                {
                    // Skip this as query cannot start with an operator
                }
                else if (token.TokenType == TokenType.NoiseWord)
                {
                    // Drop any trailing operators that would now dangle before
                    // the discarded noise word.
                    UnrollExpression(TokenType.Operator);
                    lastParsedToken = token;
                }
                else
                {
                    // Get the last (previous) token
                    Token lastToken = GetLastToken();

                    if (token.TokenType == TokenType.UserItem)
                    {
                        // Check if there is a previous token and if it is an
                        // expression, then add an 'AND' operator
                        if (lastToken != null &&
                            (lastToken.TokenType & TokenType.Expression) == lastToken.TokenType)
                        {
                            m_Tokens.Add(m_BuiltInTokens["and"]);
                        }
                    }
                    else if (((token.TokenType & TokenType.NotOperator) == token.TokenType) &&
                             lastToken != null &&
                             (lastToken.TokenType & TokenType.Expression) == lastToken.TokenType)
                    {
                        // Same goes for not. If this token is a 'NOT' operator and
                        // the last token is an expression, then add an 'and' token
                        // to keep the syntax correct.
                        m_Tokens.Add(m_BuiltInTokens["and"]);
                    }

                    // Add the token to the list
                    m_Tokens.Add(token);

                    // Update the last parsed token to this one
                    lastParsedToken = token;
                }
            }

            return IsValid();
        }

        #region Private Helper Stuff

        /// <summary>
        /// Validates the tokens and checks if they correctly form a query.
        /// Walks the token list as a simple state machine: each token type
        /// determines the set of token types allowed next, and parenthesis
        /// balance is tracked along the way.
        /// </summary>
        private bool IsValid()
        {
            if (m_Tokens.Count == 0)
            {
                m_Error = "Search string is empty";
                return false;
            }

            bool valid = true;
            bool lastItemOK = false;
            TokenType nextItem = TokenType.UserItem | TokenType.LeftParenthis | TokenType.NotOperator;
            int balance = 0;

            for (int tokIndex = 0; tokIndex < m_Tokens.Count; tokIndex++)
            {
                Token token = m_Tokens[tokIndex];

                if ((token.TokenType & nextItem) != 0)
                {
                    switch (token.TokenType)
                    {
                        case (TokenType.UserItem):
                            nextItem = TokenType.BinaryOperator | TokenType.RightParenthis;
                            lastItemOK = true;
                            break;

                        case (TokenType.AndOperator):
                            nextItem = TokenType.UserItem | TokenType.NotOperator | TokenType.LeftParenthis;
                            lastItemOK = false;
                            break;

                        case (TokenType.NearOperator):
                            nextItem = TokenType.UserItem;
                            lastItemOK = false;
                            break;

                        case (TokenType.OrOperator):
                            nextItem = TokenType.UserItem | TokenType.LeftParenthis;
                            lastItemOK = false;
                            break;

                        case (TokenType.NotOperator):
                            nextItem = TokenType.UserItem | TokenType.LeftParenthis;
                            lastItemOK = false;
                            break;

                        case (TokenType.LeftParenthis):
                            balance++;
                            nextItem = TokenType.UserItem;
                            lastItemOK = false;
                            break;

                        case (TokenType.RightParenthis):
                            balance--;
                            nextItem = TokenType.OrOperator | TokenType.AndOperator;
                            // Only a fully-closed expression may end the query
                            lastItemOK = (balance <= 0);
                            break;
                    }

                    if (balance < 0)
                    {
                        valid = false;
                        m_Error = "Mismatched parenthesis";
                        break;
                    }
                }
                else
                {
                    valid = false;
                    m_Error = "Unexpected word or character found: " + m_Tokens[tokIndex].Value;
                    break;
                }
            }

            if (balance != 0)
            {
                valid = false;
                m_Error = "Mismatched parenthesis";
            }
            else if (valid && !lastItemOK)
            {
                valid = false;
                m_Error = "Unexpected end of search string after: " + m_Tokens[m_Tokens.Count - 1].Value;
            }

            return valid;
        }

        /// <summary>
        /// Token categories. Individual types are single bits so that the
        /// composite masks below can be tested with bitwise AND.
        /// </summary>
        [Flags]
        private enum TokenType
        {
            UserItem = 1,
            AndOperator = 2,
            OrOperator = 4,
            NotOperator = 8,
            LeftParenthis = 16,
            RightParenthis = 32,
            NearOperator = 64,
            NoiseWord = 128,
            Operator = AndOperator | OrOperator | NotOperator | NearOperator,
            BinaryOperator = AndOperator | OrOperator | NearOperator,
            Expression = RightParenthis | UserItem
        }

        /// <summary>
        /// Gets a token from the specified text. Reserved words map to their
        /// shared built-in operator tokens; everything else becomes either a
        /// noise word or a user search item.
        /// </summary>
        private static Token GetToken(string text)
        {
            Token builtIn;
            if (m_BuiltInTokens.TryGetValue(text, out builtIn))
                return builtIn;

            Token token = new Token();
            token.Value = text;
            token.TokenType = m_NoiseWords.Contains(text) ? TokenType.NoiseWord : TokenType.UserItem;
            return token;
        }

        /// <summary>
        /// Gets the last token in the list. If there is no last token, null is returned.
        /// </summary>
        private Token GetLastToken()
        {
            if (m_Tokens.Count > 0)
                return m_Tokens[m_Tokens.Count - 1];

            return null;
        }

        /// <summary>
        /// Removes trailing tokens whose type matches the given mask, stopping
        /// at the first token that does not match.
        /// </summary>
        /// <remarks>
        /// Removal is by index, not by reference: the built-in operator tokens
        /// are shared instances, so List.Remove(token) would delete the FIRST
        /// occurrence of e.g. "AND" in the list rather than the trailing one.
        /// </remarks>
        private void UnrollExpression(TokenType type)
        {
            for (int i = m_Tokens.Count; i > 0; i--)
            {
                Token tok = m_Tokens[i - 1];

                if ((tok.TokenType & type) != 0)
                {
                    m_Tokens.RemoveAt(i - 1);
                }
                else
                {
                    break;
                }
            }
        }

        /// <summary>
        /// Counts how many times the quote (") character appears in the specified string.
        /// </summary>
        private static int CountQuotes(IEnumerable<char> s)
        {
            int count = 0;

            foreach (char c in s)
            {
                if (c == '"')
                    count++;
            }

            return count;
        }

        /// <summary>
        /// Parses the match collection into a list of words or phrases that need to be tokenized.
        /// </summary>
        private static List<string> ParseMatchesIntoWordList(MatchCollection matches)
        {
            // This will contain our list of raw words
            List<string> wordList1 = new List<string>();

            // The current word we've got. We store this to check if it's a valid
            // word before we add it. Kept as a defensive measure: if the regex
            // ever splits a phrase such as "bunch of grapes" into separate
            // matches, the fragments are re-joined here into a single phrase
            // before being added to the list, as the SQL clause generator
            // expects phrases to arrive as one quoted word.
            string currentWord = string.Empty;

            foreach (Match match in matches)
            {
                if (currentWord == string.Empty)
                {
                    // This is a new word, so set our current word
                    currentWord = match.Value;

                    // If the word starts with a quote or -quote, and doesn't end with
                    // one, then skip to the next word. We need to do this until we find
                    // the word with the end quote, and add the whole phrase as a single
                    // word to our word list.
                    if ((currentWord.StartsWith("-\"") || currentWord.StartsWith("\"")) &&
                        !currentWord.EndsWith("\""))
                    {
                        continue;
                    }
                }
                else
                {
                    // Otherwise, we've got a word already which begins with a quote.
                    // First we need to append this match to it. Then we check if it
                    // still doesn't end with a quote, and if so, move to the next word.
                    // This is so we can build up phrases that are properly delimited.
                    currentWord += " " + match.Value;

                    if (!currentWord.EndsWith("\""))
                    {
                        continue;
                    }
                }

                // All done. Add our word or phrase to the list.
                wordList1.Add(currentWord);

                // Clear the word, so that we can work on the next one in the list.
                currentWord = string.Empty;
            }

            // Raw list of words
            List<string> wordList2 = new List<string>();

            // Add each match to the word list
            foreach (string w in wordList1)
            {
                // Get the word
                string word = w;

                // For words (or phrases) starting with a hyphen,
                // remove it and insert 'not' before it in the token list
                if (word.StartsWith("-"))
                {
                    word = word.Substring(1);
                    wordList2.Add("not");
                }

                wordList2.Add(word);
            }

            return wordList2;
        }

        /// <summary>
        /// A single parsed token: its category and its raw text value.
        /// </summary>
        private class Token
        {
            public TokenType TokenType;
            public string Value;

            public Token()
            {
            }

            public Token(TokenType tokenType, string value)
            {
                TokenType = tokenType;
                Value = value;
            }

            public override string ToString()
            {
                return Value;
            }
        }

        #endregion
    }
}