Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- using System;
- using System.Collections.Generic;
- using System.Diagnostics;
- using System.Text;
- using System.Text.RegularExpressions;
- namespace QueryParser
- {
- /// <summary>
- /// This class is used to parse a user entered string into a query
- /// for use in a SQL statement with the CONTAINS clause for full-text searching
- /// </summary>
- /// <remarks>
- /// This query parser is based heavily on code originally written by Robert Dominy (http://dev.angusog.com/).
- /// It was originally in server-side JScript, and converted to C# for use in ASP.NET by Munsifali Rashid.
- /// </remarks>
public class UserQueryParser
{
    #region Constructor

    /// <summary>
    /// Registers the fixed operator and parenthesis tokens the parser
    /// recognises. Keys are the lower-case textual forms matched against
    /// the (lower-cased) user query.
    /// </summary>
    static UserQueryParser()
    {
        m_BuiltInTokens["and"] = new Token(TokenType.AndOperator, "AND");
        m_BuiltInTokens["or"] = new Token(TokenType.OrOperator, "OR");
        m_BuiltInTokens["near"] = new Token(TokenType.NearOperator, "NEAR");
        m_BuiltInTokens["not"] = new Token(TokenType.NotOperator, "NOT");
        m_BuiltInTokens["("] = new Token(TokenType.LeftParenthis, "(");
        m_BuiltInTokens[")"] = new Token(TokenType.RightParenthis, ")");
    }

    #endregion

    #region Private variables

    // Message describing the last parse failure; empty after a successful parse.
    private string m_Error = string.Empty;

    // Token stream produced by the most recent call to ParseTokens().
    private readonly List<Token> m_Tokens = new List<Token>();

    // Words ignored during parsing. NOTE(review): this list is shared by all
    // instances and is not synchronised - populate it once at start-up,
    // before any parsing happens.
    private static readonly List<String> m_NoiseWords = new List<string>();

    // Built-in operator/parenthesis tokens, keyed by their lower-case text.
    private static readonly Dictionary<String, Token> m_BuiltInTokens = new Dictionary<string, Token>();

    // Splits a query into, in order of preference:
    //   1. quoted phrases, optionally negated with a leading '-'   (-"big dog")
    //   2. words, optionally negated or with a trailing wildcard   (-cat, cat*)
    //   3. single parentheses                                      ( and )
    // FIX: the previous pattern kept the "/.../" delimiters from the original
    // JScript source as literal characters, which made the parenthesis
    // alternative unmatchable - '(' and ')' were swallowed into word matches
    // and never became parenthesis tokens - and tried the word alternatives
    // before the quoted-phrase one. The regex is now also built once and
    // compiled, instead of being recreated on every ParseTokens() call.
    private static readonly Regex m_TokenRegex = new Regex(
        @"(-?""[^""]*"")|([A-Za-z0-9'_\u00C0-\u00FF-]+\*?)|([\(\)])",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    #endregion

    #region Accessors

    /// <summary>
    /// Gets the error message from the last call to <see cref="ParseTokens"/>,
    /// or an empty string when the query parsed successfully.
    /// </summary>
    public string Error
    {
        get
        {
            return m_Error;
        }
    }

    /// <summary>
    /// Gets the shared, mutable list of noise words skipped during parsing.
    /// </summary>
    public static List<string> NoiseWords
    {
        get
        {
            return m_NoiseWords;
        }
    }

    #endregion

    /// <summary>
    /// Gets the parsed tokens as a string ready for use in a SQL query with the
    /// CONTAINS clause. Call <see cref="ParseTokens"/> first.
    /// </summary>
    /// <returns>The space-separated token list, e.g. "cats" AND "dogs".</returns>
    public string GetSqlQuery()
    {
        StringBuilder sb = new StringBuilder();
        foreach (Token token in m_Tokens)
        {
            string tokenValue = token.Value;

            // Escape single quotes so the value is safe inside a SQL string literal.
            tokenValue = tokenValue.Replace("'", "''");

            // Wrap user terms in double quotes unless they already are quoted
            // (i.e. the token is a phrase search).
            if (token.TokenType == TokenType.UserItem && !tokenValue.StartsWith("\"") && !tokenValue.EndsWith("\""))
                tokenValue = string.Format("\"{0}\"", tokenValue);

            Debug.WriteLine(" - " + tokenValue);

            // Append the token value to the list.
            sb.Append(tokenValue);
            sb.Append(" ");
        }
        return sb.ToString().Trim();
    }

    /// <summary>
    /// Parses the query and initialises the token list.
    /// </summary>
    /// <param name="userQuery">The user query.</param>
    /// <returns>[True] if the query is valid, otherwise [False] (see <see cref="Error"/>).</returns>
    public bool ParseTokens(string userQuery)
    {
        // FIX: clear any stale message from a previous failed parse so that
        // Error is empty after a successful parse.
        m_Error = string.Empty;

        // Phrases must be properly delimited: an even number of quote marks.
        if (CountQuotes(userQuery) % 2 != 0)
        {
            m_Error = "Invalid number of quote marks";
            return false;
        }

        // Normalise case so the lower-case built-in token lookup works.
        // Invariant casing avoids culture surprises (e.g. Turkish 'I').
        userQuery = userQuery.ToLowerInvariant();

        // A query cannot start with a NOT operator, so strip a leading '-'.
        // FIX: the Substring result was previously discarded, making this a no-op.
        if (userQuery.StartsWith("-"))
            userQuery = userQuery.Substring(1);

        // Parse the query into raw matches.
        MatchCollection matches = m_TokenRegex.Matches(userQuery);

        // Nothing parsed yet, so no tokens.
        m_Tokens.Clear();
        Token lastParsedToken = null;

        // Turn the regex matches into words/phrases, then into tokens.
        List<string> words = ParseMatchesIntoWordList(matches);
        foreach (string word in words)
        {
            Token token = GetToken(word);
            if (lastParsedToken != null && (lastParsedToken.TokenType == TokenType.NoiseWord) && (token.TokenType & TokenType.BinaryOperator) == token.TokenType)
            {
                // Skip this token since it joins a noise word.
            }
            else if (lastParsedToken == null && (token.TokenType & TokenType.Operator) == token.TokenType)
            {
                // Skip this, as a query cannot start with an operator.
            }
            else if (token.TokenType == TokenType.NoiseWord)
            {
                // Drop trailing operators that would now dangle before the
                // noise word (e.g. "cats and the" -> "cats").
                UnrollExpression(TokenType.Operator);
                lastParsedToken = token;
            }
            else
            {
                // Get the last (previous) token.
                Token lastToken = GetLastToken();
                if (token.TokenType == TokenType.UserItem)
                {
                    // Two adjacent expressions are implicitly joined with AND.
                    if (lastToken != null && (lastToken.TokenType & TokenType.Expression) == lastToken.TokenType)
                    {
                        m_Tokens.Add(m_BuiltInTokens["and"]);
                    }
                }
                else if (((token.TokenType & TokenType.NotOperator) == token.TokenType) && lastToken != null && (lastToken.TokenType & TokenType.Expression) == lastToken.TokenType)
                {
                    // Likewise, NOT directly after an expression needs a joining
                    // AND to keep the CONTAINS syntax correct.
                    m_Tokens.Add(m_BuiltInTokens["and"]);
                }

                // Add the token to the list and remember it.
                m_Tokens.Add(token);
                lastParsedToken = token;
            }
        }
        return IsValid();
    }

    #region Private Helper Stuff

    /// <summary>
    /// Validates the token stream as a whole: each token must be legal after
    /// its predecessor, parentheses must balance, and the query must not end
    /// on an operator or unclosed group.
    /// </summary>
    private bool IsValid()
    {
        if (m_Tokens.Count == 0)
        {
            m_Error = "Search string is empty";
            return false;
        }

        bool valid = true;
        bool lastItemOK = false;
        // Token types that may legally start the query.
        TokenType nextItem = TokenType.UserItem | TokenType.LeftParenthis | TokenType.NotOperator;
        // Running parenthesis balance: +1 for '(', -1 for ')'.
        int balance = 0;

        for (int tokIndex = 0; tokIndex < m_Tokens.Count; tokIndex++)
        {
            Token token = m_Tokens[tokIndex];
            if ((token.TokenType & nextItem) != 0)
            {
                // Each case records which token types may legally follow, and
                // whether the query could validly end on this token.
                switch (token.TokenType)
                {
                    case (TokenType.UserItem):
                        nextItem = TokenType.BinaryOperator | TokenType.RightParenthis;
                        lastItemOK = true;
                        break;
                    case (TokenType.AndOperator):
                        nextItem = TokenType.UserItem | TokenType.NotOperator | TokenType.LeftParenthis;
                        lastItemOK = false;
                        break;
                    case (TokenType.NearOperator):
                        nextItem = TokenType.UserItem;
                        lastItemOK = false;
                        break;
                    case (TokenType.OrOperator):
                        nextItem = TokenType.UserItem | TokenType.LeftParenthis;
                        lastItemOK = false;
                        break;
                    case (TokenType.NotOperator):
                        nextItem = TokenType.UserItem | TokenType.LeftParenthis;
                        lastItemOK = false;
                        break;
                    case (TokenType.LeftParenthis):
                        balance++;
                        nextItem = TokenType.UserItem;
                        lastItemOK = false;
                        break;
                    case (TokenType.RightParenthis):
                        balance--;
                        nextItem = TokenType.OrOperator | TokenType.AndOperator;
                        lastItemOK = (balance <= 0);
                        break;
                }
                if (balance < 0)
                {
                    valid = false;
                    m_Error = "Mismatched parenthesis";
                    break;
                }
            }
            else
            {
                valid = false;
                m_Error = "Unexpected word or character found: " + m_Tokens[tokIndex].Value;
                break;
            }
        }

        if (balance != 0)
        {
            valid = false;
            m_Error = "Mismatched parenthesis";
        }
        else if (valid && !lastItemOK)
        {
            valid = false;
            m_Error = "Unexpected end of search string after: " + m_Tokens[m_Tokens.Count - 1].Value;
        }
        return valid;
    }

    // Flags so combinations (Operator, BinaryOperator, Expression) can be
    // tested with a single bitwise AND.
    [Flags]
    private enum TokenType
    {
        UserItem = 1,
        AndOperator = 2,
        OrOperator = 4,
        NotOperator = 8,
        LeftParenthis = 16,
        RightParenthis = 32,
        NearOperator = 64,
        NoiseWord = 128,
        Operator = AndOperator | OrOperator | NotOperator | NearOperator,
        BinaryOperator = AndOperator | OrOperator | NearOperator,
        Expression = RightParenthis | UserItem
    }

    /// <summary>
    /// Gets the token for the specified word: a built-in operator/parenthesis,
    /// a noise word, or an ordinary user search term.
    /// </summary>
    private static Token GetToken(string text)
    {
        // Single lookup instead of ContainsKey + indexer.
        Token builtIn;
        if (m_BuiltInTokens.TryGetValue(text, out builtIn))
            return builtIn;

        Token token = new Token();
        token.Value = text;
        token.TokenType = m_NoiseWords.Contains(text) ? TokenType.NoiseWord : TokenType.UserItem;
        return token;
    }

    /// <summary>
    /// Gets the last token in the list. If there is no last token, null is returned.
    /// </summary>
    private Token GetLastToken()
    {
        if (m_Tokens.Count > 0)
            return m_Tokens[m_Tokens.Count - 1];
        return null;
    }

    /// <summary>
    /// Removes tokens from the end of the list while they match the specified
    /// type mask, stopping at the first token that does not.
    /// </summary>
    private void UnrollExpression(TokenType type)
    {
        for (int i = m_Tokens.Count; i > 0; i--)
        {
            Token tok = m_Tokens[i - 1];
            if ((tok.TokenType & type) != 0)
            {
                // FIX: Remove(tok) removed the FIRST occurrence of the token
                // instance. Built-in tokens (AND/OR/...) are shared instances,
                // so a repeated operator caused the wrong element to be removed.
                m_Tokens.RemoveAt(i - 1);
            }
            else
            {
                break;
            }
        }
    }

    /// <summary>
    /// Counts how many times the quote (") character appears in the specified string.
    /// </summary>
    private static int CountQuotes(IEnumerable<char> s)
    {
        int count = 0;
        foreach (char c in s)
        {
            if (c == '"')
                count++;
        }
        return count;
    }

    /// <summary>
    /// Converts the regex matches into the list of words/phrases to tokenize,
    /// expanding a leading '-' into a separate "not" word.
    /// </summary>
    private static List<string> ParseMatchesIntoWordList(MatchCollection matches)
    {
        // This will contain our list of raw words.
        List<String> wordList1 = new List<string>();

        // If a quoted phrase is ever split across several matches, the partial
        // phrase is accumulated here until the closing quote is seen, then
        // added as a single word. (With the current regex a quoted phrase is
        // normally one match, so this is retained as a safety net.)
        string currentWord = string.Empty;
        foreach (Match match in matches)
        {
            if (currentWord == string.Empty)
            {
                // This is a new word, so set our current word.
                currentWord = match.Value;

                // A word starting with a quote (or -quote) but not ending with
                // one is an unfinished phrase: keep accumulating.
                if ((currentWord.StartsWith("-\"") || currentWord.StartsWith("\"")) && !currentWord.EndsWith("\""))
                {
                    continue;
                }
            }
            else
            {
                // Continue the open phrase; stop only once the closing quote arrives.
                currentWord += " " + match.Value;
                if (!currentWord.EndsWith("\""))
                {
                    continue;
                }
            }

            // All done. Add our word or phrase to the list.
            wordList1.Add(currentWord);
            // Clear the word, so that we can work on the next one in the list.
            currentWord = string.Empty;
        }

        // Final list of words, with '-' negation expanded.
        List<String> wordList2 = new List<string>();
        foreach (string w in wordList1)
        {
            string word = w;

            // For words (or phrases) starting with a hyphen, remove it and
            // insert 'not' before the word in the token list.
            if (word.StartsWith("-"))
            {
                word = word.Substring(1);
                wordList2.Add("not");
            }
            wordList2.Add(word);
        }
        return wordList2;
    }

    // A single parsed token: its classification plus its output text.
    private class Token
    {
        public TokenType TokenType;
        public string Value;

        public Token()
        {
        }

        public Token(TokenType tokenType, string value)
        {
            TokenType = tokenType;
            Value = value;
        }

        public override string ToString()
        {
            return Value;
        }
    }

    #endregion
}
- }
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement