Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package lucene;
- import java.io.IOException;
- import java.util.HashMap;
- import java.util.Map;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.core.KeywordAnalyzer;
- import org.apache.lucene.analysis.core.SimpleAnalyzer;
- import org.apache.lucene.analysis.de.GermanAnalyzer;
- import org.apache.lucene.analysis.en.EnglishAnalyzer;
- import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.StringField;
- import org.apache.lucene.document.TextField;
- import org.apache.lucene.index.DirectoryReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.index.MultiReader;
- import org.apache.lucene.queryparser.classic.ParseException;
- import org.apache.lucene.queryparser.classic.QueryParser;
- import org.apache.lucene.search.BooleanClause;
- import org.apache.lucene.search.BooleanQuery;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.WildcardQuery;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.RAMDirectory;
- import org.apache.lucene.util.Version;
- import org.junit.Assert;
- import org.junit.Test;
- /**
- * Test searching on the lucene index.
- */
- public class TestLuceneIndex
- {
- final static String HERSENER = "Hersener";
- final static String BARBARA = "Barbara";
- final static String PK_BARBARA = "19j3bot15djhm8ai";
- final static String PK_HERSENER = "sfnj3ot15djhm8ai";
- static QueryParser getParser(Analyzer analyzer)
- {
- // change parser if wanted
- return new QueryParser(Version.LUCENE_40, "CONTENT", analyzer);
- }
- @Test
- public void testIndexWithGermanAnalyzer() throws Exception
- {
- PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(1);
- Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);
- DirectoryReader ireader = DirectoryReader.open(directory);
- MultiReader mr = new MultiReader(ireader);
- IndexSearcher searcher = new IndexSearcher(mr);
- checkUsingExactMatch(analyzer, searcher);
- checkUsingExactMatchAndStar(analyzer, searcher);
- }
- @Test
- public void testIndexWithSimpleAnalyzer() throws Exception
- {
- PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(3);
- Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);
- DirectoryReader ireader = DirectoryReader.open(directory);
- MultiReader mr = new MultiReader(ireader);
- IndexSearcher searcher = new IndexSearcher(mr);
- checkUsingExactMatch(analyzer, searcher);
- checkUsingExactMatchAndStar(analyzer, searcher);
- }
- @Test
- public void testIndexWithGermanAnalyzerStimmingProblem() throws Exception
- {
- PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(1);
- Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);
- DirectoryReader ireader = DirectoryReader.open(directory);
- MultiReader mr = new MultiReader(ireader);
- IndexSearcher searcher = new IndexSearcher(mr);
- checkUsingPartialMatchAndStar(analyzer, searcher);
- }
- @Test
- public void testIndexWithSimpleAnalyzerStimmingNoProblem() throws Exception
- {
- PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(3);
- Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);
- DirectoryReader ireader = DirectoryReader.open(directory);
- MultiReader mr = new MultiReader(ireader);
- IndexSearcher searcher = new IndexSearcher(mr);
- checkSimpleAnalyzerOnFamilies(analyzer, searcher);
- }
- @Test
- public void testIndexWithEnglishAnalyzerStimmingProblem() throws Exception
- {
- PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(2);
- Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);
- DirectoryReader ireader = DirectoryReader.open(directory);
- MultiReader mr = new MultiReader(ireader);
- IndexSearcher searcher = new IndexSearcher(mr);
- checkSimpleAnalyzerOnFamilies(analyzer, searcher);
- }
- /**
- * This works as expected using the {@link GermanAnalyzer} and the {@link SimpleAnalyzer}.
- *
- * @param analyzer The analyzer used to create the index
- * @param searcher The IndexSearcher to get the indexed data
- * @throws ParseException
- * @throws IOException
- */
- private void checkUsingExactMatch(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
- throws ParseException, IOException
- {
- QueryParser parser = TestLuceneIndex.getParser(analyzer);
- Query contentQuery = parser.parse(TestLuceneIndex.BARBARA);
- BooleanQuery query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 1: result count", 1, hits.length);
- Document hitDoc = searcher.doc(hits[0].doc);
- Assert.assertEquals("Test 2: result keyrange", "S_ACTSTATUS", hitDoc.get("KEYRANGE"));
- Assert.assertEquals("Test 3: result keytable content", "Erledigt", hitDoc.get("KEYTABLE_CONTENT"));
- Assert.assertEquals("Test 4: result PKSIMPLE", TestLuceneIndex.PK_BARBARA, hitDoc.get("PKSIMPLE"));
- Assert.assertNull("Test 5: result content", hitDoc.get("CONTENT"));
- contentQuery = parser.parse(TestLuceneIndex.HERSENER);
- query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 6: result count", 1, hits.length);
- hitDoc = searcher.doc(hits[0].doc);
- Assert.assertEquals("Test 7: result keyrange", "S_ACTSTATUS", hitDoc.get("KEYRANGE"));
- Assert.assertEquals("Test 8: result keytable content", "Nicht begonnen / offen",
- hitDoc.get("KEYTABLE_CONTENT"));
- Assert.assertEquals("Test 9: result PKSIMPLE", TestLuceneIndex.PK_HERSENER, hitDoc.get("PKSIMPLE"));
- Assert.assertNull("Test 10: result content", hitDoc.get("CONTENT"));
- }
- /**
- * This works **not** as expected using the {@link GermanAnalyzer}. Using the {@link SimpleAnalyzer} no stemming
- * is done and it works.
- *
- * We debugged this and it seems the string "Hersener" will be stemmed during indexing to "hers".
- * The searcher does not stem the given search string when a star is added.
- * The query "Barbara*" will find the indexed string "Barbara" as expected.
- * The query "Hersener*" will **not** find the indexed string "Hersener" as expected when using the {@link GermanAnalyzer}.
- * So Test 2 will fail.
- *
- * @param analyzer The analyzer used to create the index
- * @param searcher The IndexSearcher to get the indexed data
- * @throws ParseException
- * @throws IOException
- */
- private void checkUsingExactMatchAndStar(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
- throws ParseException, IOException
- {
- QueryParser parser = TestLuceneIndex.getParser(analyzer);
- Query contentQuery = parser.parse(TestLuceneIndex.BARBARA + "*");
- BooleanQuery query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 1: result count", 1, hits.length);
- contentQuery = parser.parse(TestLuceneIndex.BARBARA + "*");
- query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 2: (will fail using GermanAnalyzer but should not) result count", 1,
- hits.length);
- }
- /**
- * This works **not** as expected using the {@link GermanAnalyzer} and the {@link SimpleAnalyzer}.
- *
- * We debugged this and it seems the string "Hersener" will be stemmed during indexing to "hers".
- * The string "Barbara" will not be stemmed during indexing.
- * The searcher does not stem the given search string when a star is added.
- * The query "Barb*" will find the indexed string "Barbara" as expected.
- * The query "Hers*" will find the indexed string "Hersener" as expected.
- * The query "Barba*" will find the indexed string "Barbara" as expected.
- * The query "Herse*" will **not** find the indexed string "Hersener" as expected.
- * So Test 4 will fail.
- *
- * @param analyzer The analyzer used to create the index
- * @param searcher The IndexSearcher to get the indexed data
- * @throws ParseException
- * @throws IOException
- */
- private void checkUsingPartialMatchAndStar(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
- throws ParseException, IOException
- {
- QueryParser parser = TestLuceneIndex.getParser(analyzer);
- Query contentQuery = parser.parse(TestLuceneIndex.BARBARA.substring(0, 4) + "*");
- BooleanQuery query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 1: result count", 1, hits.length);
- contentQuery = parser.parse(TestLuceneIndex.BARBARA.substring(0, 5) + "*");
- query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 2: result count", 1, hits.length);
- contentQuery = parser.parse(TestLuceneIndex.HERSENER.substring(0, 4) + "*");
- query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 3: result count", 1, hits.length);
- contentQuery =
- parser.parse(TestLuceneIndex.HERSENER.substring(
- 0, 5) + "*");
- query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 4: (will fail using GermanAnalyzer but should not) result count", 1,
- hits.length);
- }
- /**
- * This works **not** as expected using the {@link EnglishAnalyzer}.
- *
- * It seems the string "families" will be stemmed during indexing to "famili".
- * The searcher does not stem the given search string when a star is added, so the string is not found.
- * The query "families" will find the indexed string "families" as expected.
- * The query "famil*" will find the indexed string "families" as expected.
- * The query "famili*" will find the indexed string "families" as expected.
- * The query "familie*" will **not** find the indexed string "families" as expected when using the {@link EnglishAnalyzer}.
- * The query "families*" will **not** find the indexed string "families" as expected when using the {@link EnglishAnalyzer}.
- * So Test 16 will fail.
- *
- * @param analyzer The analyzer used to create the index
- * @param searcher The IndexSearcher to get the indexed data
- * @throws ParseException
- * @throws IOException
- */
- private void checkSimpleAnalyzerOnFamilies(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
- throws ParseException, IOException
- {
- QueryParser parser = TestLuceneIndex.getParser(analyzer);
- Query contentQuery = parser.parse("families");
- BooleanQuery query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 1: result count", 1, hits.length);
- contentQuery = parser.parse("famil*");
- query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 2: result count", 1, hits.length);
- contentQuery = parser.parse("famili*");
- query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 3: result count", 1, hits.length);
- contentQuery = parser.parse("familie*");
- query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 4: (will fail using EnglishAnalyzer but should not) result count", 1,
- hits.length);
- contentQuery = parser.parse("families*");
- query = new BooleanQuery();
- query.add(contentQuery, BooleanClause.Occur.MUST);
- hits = searcher.search(query, null, 100).scoreDocs;
- Assert.assertEquals("Test 5: (will fail using EnglishAnalyzer but should not) result count", 1,
- hits.length);
- }
- private static Directory getAndFillIndex(PerFieldAnalyzerWrapper analyzer) throws IOException
- {
- IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
- Directory directory = new RAMDirectory();
- IndexWriter writer = new IndexWriter(directory, config);
- Document doc;
- doc =
- TestLuceneIndex.getDocument("Held " + TestLuceneIndex.BARBARA, "Erledigt", "S_ACTSTATUS",
- TestLuceneIndex.PK_BARBARA);
- writer.addDocument(doc);
- doc = TestLuceneIndex.getDocument("Hero families", "Erledigt", "S_ACTSTATUS", "19j3bot15djhmwtf");
- writer.addDocument(doc);
- doc =
- TestLuceneIndex.getDocument(TestLuceneIndex.HERSENER, "Nicht begonnen / offen", "S_ACTSTATUS",
- TestLuceneIndex.PK_HERSENER);
- writer.addDocument(doc);
- doc = TestLuceneIndex.getDocument("Aguaplan", "Erledigt", "S_ACTSTATUS", "wzrt3bot15djhqdtbh");
- writer.addDocument(doc);
- writer.close();
- return directory;
- }
- private static Document getDocument(String content, String keytableContent, String keyrange,
- String pkSimple)
- {
- Document doc;
- doc = new Document();
- doc.add(new TextField("CONTENT", content, Field.Store.NO));
- doc.add(new TextField("KEYTABLE_CONTENT", keytableContent, Field.Store.YES));
- doc.add(new StringField("KEYRANGE", keyrange, Field.Store.YES));
- doc.add(new StringField("PKSIMPLE", pkSimple, Field.Store.YES));
- return doc;
- }
- /**
- * @param usedAnalyzer {@code true}: The GermanAnalyzer will be used as default analyzer and in the "CONTENT" field
- * @return A wrapper containing the analyzers to be used.
- */
- private static PerFieldAnalyzerWrapper getAnalyzer(int usedAnalyzer)
- {
- Analyzer sim = new SimpleAnalyzer(Version.LUCENE_40);
- Analyzer ger;
- switch (usedAnalyzer)
- {
- case 1 :
- ger = new GermanAnalyzer(Version.LUCENE_40, GermanAnalyzer.getDefaultStopSet());
- break;
- case 2 :
- ger = new EnglishAnalyzer(Version.LUCENE_40, EnglishAnalyzer.getDefaultStopSet());
- break;
- case 3 :
- default :
- ger = sim;
- break;
- }
- Analyzer key = new KeywordAnalyzer();
- Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
- fieldAnalyzers.put("KEYTABLE_CONTENT", sim);
- fieldAnalyzers.put("KEYRANGE", key);
- fieldAnalyzers.put("PKSIMPLE", key);
- fieldAnalyzers.put("CONTENT", ger);
- return new PerFieldAnalyzerWrapper(ger, fieldAnalyzers);
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment