Lucene4-Wildcard-Stemming-Testcase

package lucene;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Assert;
import org.junit.Test;

/**
 * Test searching on the lucene index.
 */
public class TestLuceneIndex
{
    final static String HERSENER = "Hersener";
    final static String BARBARA = "Barbara";
    final static String PK_BARBARA = "19j3bot15djhm8ai";
    final static String PK_HERSENER = "sfnj3ot15djhm8ai";

    static QueryParser getParser(Analyzer analyzer)
    {
        // change parser if wanted
        return new QueryParser(Version.LUCENE_40, "CONTENT", analyzer);
    }

    @Test
    public void testIndexWithGermanAnalyzer() throws Exception
    {
        PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(1);
        Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);

        DirectoryReader ireader = DirectoryReader.open(directory);
        MultiReader mr = new MultiReader(ireader);
        IndexSearcher searcher = new IndexSearcher(mr);

        checkUsingExactMatch(analyzer, searcher);
        checkUsingExactMatchAndStar(analyzer, searcher);
    }

    @Test
    public void testIndexWithSimpleAnalyzer() throws Exception
    {
        PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(3);
        Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);

        DirectoryReader ireader = DirectoryReader.open(directory);
        MultiReader mr = new MultiReader(ireader);
        IndexSearcher searcher = new IndexSearcher(mr);

        checkUsingExactMatch(analyzer, searcher);
        checkUsingExactMatchAndStar(analyzer, searcher);
    }

    @Test
    public void testIndexWithGermanAnalyzerStimmingProblem() throws Exception
    {
        PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(1);
        Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);

        DirectoryReader ireader = DirectoryReader.open(directory);
        MultiReader mr = new MultiReader(ireader);
        IndexSearcher searcher = new IndexSearcher(mr);

        checkUsingPartialMatchAndStar(analyzer, searcher);
    }

    @Test
    public void testIndexWithSimpleAnalyzerStimmingNoProblem() throws Exception
    {
        PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(3);
        Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);

        DirectoryReader ireader = DirectoryReader.open(directory);
        MultiReader mr = new MultiReader(ireader);
        IndexSearcher searcher = new IndexSearcher(mr);

        checkSimpleAnalyzerOnFamilies(analyzer, searcher);
    }

    @Test
    public void testIndexWithEnglishAnalyzerStimmingProblem() throws Exception
    {
        PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(2);
        Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);

        DirectoryReader ireader = DirectoryReader.open(directory);
        MultiReader mr = new MultiReader(ireader);
        IndexSearcher searcher = new IndexSearcher(mr);

        checkSimpleAnalyzerOnFamilies(analyzer, searcher);
    }

    /**
     * This works as expected using the {@link GermanAnalyzer} and the {@link SimpleAnalyzer}.
     *
     * @param analyzer          The analyzer used to create the index
     * @param searcher          The IndexSearcher to get the indexed data
     * @throws ParseException
     * @throws IOException
     */
    private void checkUsingExactMatch(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
        throws ParseException, IOException
    {
        QueryParser parser = TestLuceneIndex.getParser(analyzer);
        Query contentQuery = parser.parse(TestLuceneIndex.BARBARA);
        BooleanQuery query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 1: result count", 1, hits.length);

        Document hitDoc = searcher.doc(hits[0].doc);
        Assert.assertEquals("Test 2: result keyrange", "S_ACTSTATUS", hitDoc.get("KEYRANGE"));
        Assert.assertEquals("Test 3: result keytable content", "Erledigt", hitDoc.get("KEYTABLE_CONTENT"));
        Assert.assertEquals("Test 4: result PKSIMPLE", TestLuceneIndex.PK_BARBARA, hitDoc.get("PKSIMPLE"));
        Assert.assertNull("Test 5: result content", hitDoc.get("CONTENT"));

        contentQuery = parser.parse(TestLuceneIndex.HERSENER);
        query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 6: result count", 1, hits.length);

        hitDoc = searcher.doc(hits[0].doc);
        Assert.assertEquals("Test 7: result keyrange", "S_ACTSTATUS", hitDoc.get("KEYRANGE"));
        Assert.assertEquals("Test 8: result keytable content", "Nicht begonnen / offen",
            hitDoc.get("KEYTABLE_CONTENT"));
        Assert.assertEquals("Test 9: result PKSIMPLE", TestLuceneIndex.PK_HERSENER, hitDoc.get("PKSIMPLE"));
        Assert.assertNull("Test 10: result content", hitDoc.get("CONTENT"));
    }

    /**
     * This works **not** as expected using the {@link GermanAnalyzer}. Using the {@link SimpleAnalyzer} no stemming
     * is done and it works.
     *
     * We debugged this and it seems the string "Hersener" will be stemmed during indexing to "hers".
     * The searcher does not stem the given search string when a star is added.
     * The query "Barbara*" will find the indexed string "Barbara" as expected.
     * The query "Hersener*" will **not** find the indexed string "Hersener" as expected when using the {@link GermanAnalyzer}.
     * So Test 2 will fail.
     *
     * @param analyzer          The analyzer used to create the index
     * @param searcher          The IndexSearcher to get the indexed data
     * @throws ParseException
     * @throws IOException
     */
    private void checkUsingExactMatchAndStar(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
        throws ParseException, IOException
    {
        QueryParser parser = TestLuceneIndex.getParser(analyzer);
        Query contentQuery = parser.parse(TestLuceneIndex.BARBARA + "*");
        BooleanQuery query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 1: result count", 1, hits.length);

        contentQuery = parser.parse(TestLuceneIndex.BARBARA + "*");
        query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 2: (will fail using GermanAnalyzer but should not) result count", 1,
            hits.length);
    }

    /**
     * This works **not** as expected using the {@link GermanAnalyzer} and the {@link SimpleAnalyzer}.
     *
     * We debugged this and it seems the string "Hersener" will be stemmed during indexing to "hers".
     * The string "Barbara" will not be stemmed during indexing.
     * The searcher does not stem the given search string when a star is added.

     * The query "Barb*" will find the indexed string "Barbara" as expected.
     * The query "Hers*" will find the indexed string "Hersener" as expected.
     * The query "Barba*" will find the indexed string "Barbara" as expected.
     * The query "Herse*" will **not** find the indexed string "Hersener" as expected.
     * So Test 4 will fail.
     *
     * @param analyzer          The analyzer used to create the index
     * @param searcher          The IndexSearcher to get the indexed data
     * @throws ParseException
     * @throws IOException
     */
    private void checkUsingPartialMatchAndStar(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
        throws ParseException, IOException
    {
        QueryParser parser = TestLuceneIndex.getParser(analyzer);
        Query contentQuery = parser.parse(TestLuceneIndex.BARBARA.substring(0, 4) + "*");
        BooleanQuery query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 1: result count", 1, hits.length);

        contentQuery = parser.parse(TestLuceneIndex.BARBARA.substring(0, 5) + "*");
        query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 2: result count", 1, hits.length);

        contentQuery = parser.parse(TestLuceneIndex.HERSENER.substring(0, 4) + "*");
        query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 3: result count", 1, hits.length);

        contentQuery =
 parser.parse(TestLuceneIndex.HERSENER.substring(
                0, 5) + "*");
        query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 4: (will fail using GermanAnalyzer but should not) result count", 1,
            hits.length);
    }

    /**
     * This works **not** as expected using the {@link EnglishAnalyzer}.
     *
     * It seems the string "families" will be stemmed during indexing to "famili".
     * The searcher does not stem the given search string when a star is added, so the string is not found.

     * The query "families" will find the indexed string "families" as expected.
     * The query "famil*" will find the indexed string "families" as expected.
     * The query "famili*" will find the indexed string "families" as expected.
     * The query "familie*" will **not** find the indexed string "families" as expected when using the {@link EnglishAnalyzer}.
     * The query "families*" will **not** find the indexed string "families" as expected when using the {@link EnglishAnalyzer}.
     * So Test 16 will fail.
     *
     * @param analyzer          The analyzer used to create the index
     * @param searcher          The IndexSearcher to get the indexed data
     * @throws ParseException
     * @throws IOException
     */
    private void checkSimpleAnalyzerOnFamilies(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
        throws ParseException, IOException
    {
        QueryParser parser = TestLuceneIndex.getParser(analyzer);
        Query contentQuery = parser.parse("families");
        BooleanQuery query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 1: result count", 1, hits.length);

        contentQuery = parser.parse("famil*");
        query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 2: result count", 1, hits.length);

        contentQuery = parser.parse("famili*");
        query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 3: result count", 1, hits.length);

        contentQuery = parser.parse("familie*");
        query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 4: (will fail using EnglishAnalyzer but should not) result count", 1,
            hits.length);

        contentQuery = parser.parse("families*");
        query = new BooleanQuery();
        query.add(contentQuery, BooleanClause.Occur.MUST);
        hits = searcher.search(query, null, 100).scoreDocs;
        Assert.assertEquals("Test 5: (will fail using EnglishAnalyzer but should not) result count", 1,
            hits.length);

    }

    private static Directory getAndFillIndex(PerFieldAnalyzerWrapper analyzer) throws IOException
    {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        Directory directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, config);
        Document doc;
        doc =
            TestLuceneIndex.getDocument("Held " + TestLuceneIndex.BARBARA, "Erledigt", "S_ACTSTATUS",
                TestLuceneIndex.PK_BARBARA);
        writer.addDocument(doc);
        doc = TestLuceneIndex.getDocument("Hero families", "Erledigt", "S_ACTSTATUS", "19j3bot15djhmwtf");
        writer.addDocument(doc);
        doc =
            TestLuceneIndex.getDocument(TestLuceneIndex.HERSENER, "Nicht begonnen / offen", "S_ACTSTATUS",
                TestLuceneIndex.PK_HERSENER);
        writer.addDocument(doc);
        doc = TestLuceneIndex.getDocument("Aguaplan", "Erledigt", "S_ACTSTATUS", "wzrt3bot15djhqdtbh");
        writer.addDocument(doc);
        writer.close();
        return directory;
    }

    private static Document getDocument(String content, String keytableContent, String keyrange,
        String pkSimple)
    {
        Document doc;
        doc = new Document();
        doc.add(new TextField("CONTENT", content, Field.Store.NO));
        doc.add(new TextField("KEYTABLE_CONTENT", keytableContent, Field.Store.YES));
        doc.add(new StringField("KEYRANGE", keyrange, Field.Store.YES));
        doc.add(new StringField("PKSIMPLE", pkSimple, Field.Store.YES));
        return doc;
    }

    /**
     * @param usedAnalyzer {@code true}: The GermanAnalyzer will be used as default analyzer and in the "CONTENT" field
     * @return A wrapper containing the analyzers to be used.
     */
    private static PerFieldAnalyzerWrapper getAnalyzer(int usedAnalyzer)
    {
        Analyzer sim = new SimpleAnalyzer(Version.LUCENE_40);
        Analyzer ger;
        switch (usedAnalyzer)
        {
            case 1 :
                ger = new GermanAnalyzer(Version.LUCENE_40, GermanAnalyzer.getDefaultStopSet());
                break;

            case 2 :
                ger = new EnglishAnalyzer(Version.LUCENE_40, EnglishAnalyzer.getDefaultStopSet());
                break;
            case 3 :
            default :
                ger = sim;
                break;
        }

        Analyzer key = new KeywordAnalyzer();

        Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
        fieldAnalyzers.put("KEYTABLE_CONTENT", sim);
        fieldAnalyzers.put("KEYRANGE", key);
        fieldAnalyzers.put("PKSIMPLE", key);
        fieldAnalyzers.put("CONTENT", ger);

        return new PerFieldAnalyzerWrapper(ger, fieldAnalyzers);
    }

}