using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
using System.Text.RegularExpressions;
namespace QueryParser
{
/// <summary>
/// This class is used to parse a user entered string into a query
/// for use in a SQL statement with the CONTAINS clause for full-text searching
/// </summary>
/// <remarks>
/// This query parser is based heavily on code originally written by Robert Dominy (http://dev.angusog.com/).
/// It was originally in server-side JScript, and converted to C# for use in ASP.NET by Munsifali Rashid.
/// </remarks>
public class UserQueryParser
{
#region Constructor
static UserQueryParser()
{
m_BuiltInTokens["and"] = new Token(TokenType.AndOperator, "AND");
m_BuiltInTokens["or"] = new Token(TokenType.OrOperator, "OR");
m_BuiltInTokens["near"] = new Token(TokenType.NearOperator, "NEAR");
m_BuiltInTokens["not"] = new Token(TokenType.NotOperator, "NOT");
m_BuiltInTokens["("] = new Token(TokenType.LeftParenthis, "(");
m_BuiltInTokens[")"] = new Token(TokenType.RightParenthis, ")");
}
#endregion
#region Private variables
private string m_Error = string.Empty;
private readonly List<Token> m_Tokens = new List<Token>();
private static readonly List<String> m_NoiseWords = new List<string>();
private static readonly Dictionary<String, Token> m_BuiltInTokens = new Dictionary<string, Token>();
#endregion
#region Accessors
public string Error
{
get
{
return m_Error;
}
}
public static List<string> NoiseWords
{
get
{
return m_NoiseWords;
}
}
#endregion
/// <summary>
/// Gets the tokens in a string ready for use in a SQL query with the CONTAINS clause
/// </summary>
public string GetSqlQuery()
{
StringBuilder sb = new StringBuilder();
foreach (Token token in m_Tokens)
{
// Get the token value
string tokenValue = token.Value;
// Make it SQL safe
tokenValue = tokenValue.Replace("'", "''");
// Wrap the token value in quotes, if it's not in quotes already (ie. might be a phrase search)
if (token.TokenType == TokenType.UserItem && !tokenValue.StartsWith("\"") && !tokenValue.EndsWith("\""))
tokenValue = string.Format("\"{0}\"", tokenValue);
Debug.WriteLine(" - " + tokenValue);
// Append the token value to the list
sb.Append(tokenValue);
sb.Append(" ");
}
return sb.ToString().Trim();
}
/// <summary>
/// Parses the query and initialises the tokens.
/// </summary>
/// <param name="userQuery">The user query</param>
/// <returns>[True] if query is valid, otherwise [False]</returns>
public bool ParseTokens(string userQuery)
{
// First make sure that we've got an even number of quotes
if (CountQuotes(userQuery) % 2 != 0)
{
m_Error = "Invalid number of quote marks";
return false;
}
// Clean up query
userQuery = userQuery.ToLower();
// Query cannot start with a not operator, so remove it if applicable
if (userQuery.StartsWith("-"))
userQuery.Substring(1);
// Parse the query into tokens
const string pattern = @"(/\s*([A-Za-z0-9'(-^"")_\u00C0-\u00FF]+\*)|([A-Za-z0-9'(-^"")_\u00C0-\u00FF]+(\*{0,1}))|(-{0,1}[""][^""]*[""])|([\(\)])\s*/)";
Regex re = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);
MatchCollection matches = re.Matches(userQuery);
// Nothing parsed yet so no token
m_Tokens.Clear();
Token lastParsedToken = null;
// Parse the regex matches into a list of words
// which we'll then turn into a list of tokens.
List<string> words = ParseMatchesIntoWordList(matches);
// Iterate through all the matches in the query
foreach (string word in words)
{
// Get the token from the word
Token token = GetToken(word);
if (lastParsedToken != null && (lastParsedToken.TokenType == TokenType.NoiseWord) && (token.TokenType & TokenType.BinaryOperator) == token.TokenType)
{
// Skip this token since it joins a noise word
}
else if (lastParsedToken == null && (token.TokenType & TokenType.Operator) == token.TokenType)
{
// Skip this as query cannot start with an operator
}
else if (token.TokenType == TokenType.NoiseWord)
{
UnrollExpression(TokenType.Operator);
lastParsedToken = token;
}
else
{
// Get the last (previous) token
Token lastToken = GetLastToken();
if (token.TokenType == TokenType.UserItem)
{
// Check if there is a previous token and if it is an expression, then add an 'AND' operator
if (lastToken != null && (lastToken.TokenType & TokenType.Expression) == lastToken.TokenType)
{
m_Tokens.Add(m_BuiltInTokens["and"]);
}
}
else if (((token.TokenType & TokenType.NotOperator) == token.TokenType) && lastToken != null && (lastToken.TokenType & TokenType.Expression) == lastToken.TokenType)
{
// Same goes for not. If this token is a 'NOT' operator and the last token is an
// expression, then add an 'and' token to keep the syntax correct.
m_Tokens.Add(m_BuiltInTokens["and"]);
}
// Add the token to the list
m_Tokens.Add(token);
// Update the last parsed token to this one
lastParsedToken = token;
}
}
return IsValid();
}
#region Private Helper Stuff
/// <summary>
/// Validates the tokens and checks if they correctly form a query
/// </summary>
private bool IsValid()
{
if (m_Tokens.Count == 0)
{
m_Error = "Search string is empty";
return false;
}
bool valid = true;
bool lastItemOK = false;
TokenType nextItem = TokenType.UserItem | TokenType.LeftParenthis | TokenType.NotOperator;
int balance = 0;
for (int tokIndex = 0; tokIndex < m_Tokens.Count; tokIndex++)
{
Token token = m_Tokens[tokIndex];
if ((token.TokenType & nextItem) != 0)
{
switch (token.TokenType)
{
case (TokenType.UserItem):
nextItem = TokenType.BinaryOperator | TokenType.RightParenthis;
lastItemOK = true;
break;
case (TokenType.AndOperator):
nextItem = TokenType.UserItem | TokenType.NotOperator | TokenType.LeftParenthis;
lastItemOK = false;
break;
case (TokenType.NearOperator):
nextItem = TokenType.UserItem;
lastItemOK = false;
break;
case (TokenType.OrOperator):
nextItem = TokenType.UserItem | TokenType.LeftParenthis;
lastItemOK = false;
break;
case (TokenType.NotOperator):
nextItem = TokenType.UserItem | TokenType.LeftParenthis;
lastItemOK = false;
break;
case (TokenType.LeftParenthis):
balance++;
nextItem = TokenType.UserItem;
lastItemOK = false;
break;
case (TokenType.RightParenthis):
balance--;
nextItem = TokenType.OrOperator | TokenType.AndOperator;
lastItemOK = (balance <= 0);
break;
}
if (balance < 0)
{
valid = false;
m_Error = "Mismatched parenthesis";
break;
}
}
else
{
valid = false;
m_Error = "Unexpected word or character found: " + m_Tokens[tokIndex].Value;
break;
}
}
if (balance != 0)
{
valid = false;
m_Error = "Mismatched parenthesis";
}
else if (valid && !lastItemOK)
{
valid = false;
m_Error = "Unexpected end of search string after: " + m_Tokens[m_Tokens.Count - 1].Value;
}
return valid;
}
[Flags]
private enum TokenType
{
UserItem = 1,
AndOperator = 2,
OrOperator = 4,
NotOperator = 8,
LeftParenthis = 16,
RightParenthis = 32,
NearOperator = 64,
NoiseWord = 128,
Operator = AndOperator | OrOperator | NotOperator | NearOperator,
BinaryOperator = AndOperator | OrOperator | NearOperator,
Expression = RightParenthis | UserItem
}
/// <summary>
/// Gets a token from the specified text
/// </summary>
private static Token GetToken(string text)
{
if (m_BuiltInTokens.ContainsKey(text))
return m_BuiltInTokens[text];
Token token = new Token();
token.Value = text;
token.TokenType = m_NoiseWords.Contains(text) ? TokenType.NoiseWord : TokenType.UserItem;
return token;
}
/// <summary>
/// Gets the last token in the list. If there is no last token, null is returned
/// </summary>
private Token GetLastToken()
{
if (m_Tokens.Count > 0)
return m_Tokens[m_Tokens.Count - 1];
return null;
}
/// <summary>
/// Rolls back to the last token of the specified type.
/// All tokens after it are removed from the list.
/// </summary>
private void UnrollExpression(TokenType type)
{
for (int i = m_Tokens.Count; i > 0; i--)
{
Token tok = m_Tokens[i - 1];
if ((tok.TokenType & type) != 0)
{
m_Tokens.Remove(tok);
}
else
{
break;
}
}
}
/// <summary>
/// Counts how many times the quote (") character appears in the specified string
/// </summary>
private static int CountQuotes(IEnumerable<char> s)
{
int count = 0;
foreach (char c in s)
{
if (c == '"')
count++;
}
return count;
}
/// <summary>
/// Parses the match collection into a list of words or phrases that need to be tokenized
/// </summary>
private static List<string> ParseMatchesIntoWordList(MatchCollection matches)
{
// This will contain our list of raw words
List<String> wordList1 = new List<string>();
// The current word we've got. We store this to check if it's a valid
// word before we add it. This is because the regex doesn't parse
// all types of phrases correctly, so we need to build up the phrase
// before it can be added to the list.
// For example, "bunch of grapes" will be parsed by the regex as: "bunch, of, grapes"
// The problem with this is that it then breaks the SQL clause generator.
// Instead, we hack around this by converting this back into a single phrase as we
// build up the word list.
string currentWord = string.Empty;
foreach (Match match in matches)
{
if (currentWord == string.Empty)
{
// This is a new word, so set our current word
currentWord = match.Value;
// If the word starts with a quote or -quote, and doesn't end with one then
// skip to the next word. We need to do this until we find the word with
// the end quote, and add the whole word as single word to our word list.
// This is because the regex has trouble parsing all permutations of phrases
// so we're hacking around some of the problems by using this method.
if ((currentWord.StartsWith("-\"") || currentWord.StartsWith("\"")) && !currentWord.EndsWith("\""))
{
continue;
}
}
else
{
// Otherwise, we've got a word already which begins with a quote.
// First we need to append it to our list. Then we check if it doesn't
// end with a quote, and if so, move to the next word. This is so we can
// build up the phrase that are properly delimited.
currentWord += " " + match.Value;
if (!currentWord.EndsWith("\""))
{
continue;
}
}
// All done. Add our word or phrase to the list.
wordList1.Add(currentWord);
// Clear the word, so that we can work on the next one in the list.
currentWord = string.Empty;
}
// Raw list of words
List<String> wordList2 = new List<string>();
// Add each match to the word list
foreach (string w in wordList1)
{
// Get the word
string word = w;
// For words (or phrases) starting with a hypen
// remove it and insert 'not' before it in the tokenlist
if (word.StartsWith("-"))
{
word = word.Substring(1);
wordList2.Add("not");
}
wordList2.Add(word);
}
return wordList2;
}
private class Token
{
public TokenType TokenType;
public string Value;
public Token()
{
}
public Token(TokenType tokenType, string value)
{
TokenType = tokenType;
Value = value;
}
public override string ToString()
{
return Value;
}
}
#endregion
}
}