courson

Lucene4-Wildcard-Stemming-Testcase

Dec 11th, 2012
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 15.79 KB | None | 0 0
  1. package lucene;
  2.  
  3. import java.io.IOException;
  4. import java.util.HashMap;
  5. import java.util.Map;
  6.  
  7. import org.apache.lucene.analysis.Analyzer;
  8. import org.apache.lucene.analysis.core.KeywordAnalyzer;
  9. import org.apache.lucene.analysis.core.SimpleAnalyzer;
  10. import org.apache.lucene.analysis.de.GermanAnalyzer;
  11. import org.apache.lucene.analysis.en.EnglishAnalyzer;
  12. import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
  13. import org.apache.lucene.document.Document;
  14. import org.apache.lucene.document.Field;
  15. import org.apache.lucene.document.StringField;
  16. import org.apache.lucene.document.TextField;
  17. import org.apache.lucene.index.DirectoryReader;
  18. import org.apache.lucene.index.IndexWriter;
  19. import org.apache.lucene.index.IndexWriterConfig;
  20. import org.apache.lucene.index.MultiReader;
  21. import org.apache.lucene.queryparser.classic.ParseException;
  22. import org.apache.lucene.queryparser.classic.QueryParser;
  23. import org.apache.lucene.search.BooleanClause;
  24. import org.apache.lucene.search.BooleanQuery;
  25. import org.apache.lucene.search.IndexSearcher;
  26. import org.apache.lucene.search.Query;
  27. import org.apache.lucene.search.ScoreDoc;
  28. import org.apache.lucene.search.WildcardQuery;
  29. import org.apache.lucene.store.Directory;
  30. import org.apache.lucene.store.RAMDirectory;
  31. import org.apache.lucene.util.Version;
  32. import org.junit.Assert;
  33. import org.junit.Test;
  34.  
  35. /**
  36.  * Test searching on the lucene index.
  37.  */
  38. public class TestLuceneIndex
  39. {
  40.     final static String HERSENER = "Hersener";
  41.     final static String BARBARA = "Barbara";
  42.     final static String PK_BARBARA = "19j3bot15djhm8ai";
  43.     final static String PK_HERSENER = "sfnj3ot15djhm8ai";
  44.  
  45.     static QueryParser getParser(Analyzer analyzer)
  46.     {
  47.         // change parser if wanted
  48.         return new QueryParser(Version.LUCENE_40, "CONTENT", analyzer);
  49.     }
  50.  
  51.     @Test
  52.     public void testIndexWithGermanAnalyzer() throws Exception
  53.     {
  54.         PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(1);
  55.         Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);
  56.  
  57.         DirectoryReader ireader = DirectoryReader.open(directory);
  58.         MultiReader mr = new MultiReader(ireader);
  59.         IndexSearcher searcher = new IndexSearcher(mr);
  60.  
  61.         checkUsingExactMatch(analyzer, searcher);
  62.         checkUsingExactMatchAndStar(analyzer, searcher);
  63.     }
  64.  
  65.     @Test
  66.     public void testIndexWithSimpleAnalyzer() throws Exception
  67.     {
  68.         PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(3);
  69.         Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);
  70.  
  71.         DirectoryReader ireader = DirectoryReader.open(directory);
  72.         MultiReader mr = new MultiReader(ireader);
  73.         IndexSearcher searcher = new IndexSearcher(mr);
  74.  
  75.         checkUsingExactMatch(analyzer, searcher);
  76.         checkUsingExactMatchAndStar(analyzer, searcher);
  77.     }
  78.  
  79.     @Test
  80.     public void testIndexWithGermanAnalyzerStimmingProblem() throws Exception
  81.     {
  82.         PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(1);
  83.         Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);
  84.  
  85.         DirectoryReader ireader = DirectoryReader.open(directory);
  86.         MultiReader mr = new MultiReader(ireader);
  87.         IndexSearcher searcher = new IndexSearcher(mr);
  88.  
  89.         checkUsingPartialMatchAndStar(analyzer, searcher);
  90.     }
  91.  
  92.     @Test
  93.     public void testIndexWithSimpleAnalyzerStimmingNoProblem() throws Exception
  94.     {
  95.         PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(3);
  96.         Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);
  97.  
  98.         DirectoryReader ireader = DirectoryReader.open(directory);
  99.         MultiReader mr = new MultiReader(ireader);
  100.         IndexSearcher searcher = new IndexSearcher(mr);
  101.  
  102.         checkSimpleAnalyzerOnFamilies(analyzer, searcher);
  103.     }
  104.  
  105.     @Test
  106.     public void testIndexWithEnglishAnalyzerStimmingProblem() throws Exception
  107.     {
  108.         PerFieldAnalyzerWrapper analyzer = TestLuceneIndex.getAnalyzer(2);
  109.         Directory directory = TestLuceneIndex.getAndFillIndex(analyzer);
  110.  
  111.         DirectoryReader ireader = DirectoryReader.open(directory);
  112.         MultiReader mr = new MultiReader(ireader);
  113.         IndexSearcher searcher = new IndexSearcher(mr);
  114.  
  115.         checkSimpleAnalyzerOnFamilies(analyzer, searcher);
  116.     }
  117.  
  118.     /**
  119.      * This works as expected using the {@link GermanAnalyzer} and the {@link SimpleAnalyzer}.
  120.      *
  121.      * @param analyzer          The analyzer used to create the index
  122.      * @param searcher          The IndexSearcher to get the indexed data
  123.      * @throws ParseException
  124.      * @throws IOException
  125.      */
  126.     private void checkUsingExactMatch(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
  127.         throws ParseException, IOException
  128.     {
  129.         QueryParser parser = TestLuceneIndex.getParser(analyzer);
  130.         Query contentQuery = parser.parse(TestLuceneIndex.BARBARA);
  131.         BooleanQuery query = new BooleanQuery();
  132.         query.add(contentQuery, BooleanClause.Occur.MUST);
  133.         ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
  134.         Assert.assertEquals("Test 1: result count", 1, hits.length);
  135.  
  136.         Document hitDoc = searcher.doc(hits[0].doc);
  137.         Assert.assertEquals("Test 2: result keyrange", "S_ACTSTATUS", hitDoc.get("KEYRANGE"));
  138.         Assert.assertEquals("Test 3: result keytable content", "Erledigt", hitDoc.get("KEYTABLE_CONTENT"));
  139.         Assert.assertEquals("Test 4: result PKSIMPLE", TestLuceneIndex.PK_BARBARA, hitDoc.get("PKSIMPLE"));
  140.         Assert.assertNull("Test 5: result content", hitDoc.get("CONTENT"));
  141.  
  142.         contentQuery = parser.parse(TestLuceneIndex.HERSENER);
  143.         query = new BooleanQuery();
  144.         query.add(contentQuery, BooleanClause.Occur.MUST);
  145.         hits = searcher.search(query, null, 100).scoreDocs;
  146.         Assert.assertEquals("Test 6: result count", 1, hits.length);
  147.  
  148.         hitDoc = searcher.doc(hits[0].doc);
  149.         Assert.assertEquals("Test 7: result keyrange", "S_ACTSTATUS", hitDoc.get("KEYRANGE"));
  150.         Assert.assertEquals("Test 8: result keytable content", "Nicht begonnen / offen",
  151.             hitDoc.get("KEYTABLE_CONTENT"));
  152.         Assert.assertEquals("Test 9: result PKSIMPLE", TestLuceneIndex.PK_HERSENER, hitDoc.get("PKSIMPLE"));
  153.         Assert.assertNull("Test 10: result content", hitDoc.get("CONTENT"));
  154.     }
  155.  
  156.     /**
  157.      * This works **not** as expected using the {@link GermanAnalyzer}. Using the {@link SimpleAnalyzer} no stemming
  158.      * is done and it works.
  159.      *
  160.      * We debugged this and it seems the string "Hersener" will be stemmed during indexing to "hers".
  161.      * The searcher does not stem the given search string when a star is added.
  162.      * The query "Barbara*" will find the indexed string "Barbara" as expected.
  163.      * The query "Hersener*" will **not** find the indexed string "Hersener" as expected when using the {@link GermanAnalyzer}.
  164.      * So Test 2 will fail.
  165.      *
  166.      * @param analyzer          The analyzer used to create the index
  167.      * @param searcher          The IndexSearcher to get the indexed data
  168.      * @throws ParseException
  169.      * @throws IOException
  170.      */
  171.     private void checkUsingExactMatchAndStar(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
  172.         throws ParseException, IOException
  173.     {
  174.         QueryParser parser = TestLuceneIndex.getParser(analyzer);
  175.         Query contentQuery = parser.parse(TestLuceneIndex.BARBARA + "*");
  176.         BooleanQuery query = new BooleanQuery();
  177.         query.add(contentQuery, BooleanClause.Occur.MUST);
  178.         ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
  179.         Assert.assertEquals("Test 1: result count", 1, hits.length);
  180.  
  181.         contentQuery = parser.parse(TestLuceneIndex.BARBARA + "*");
  182.         query = new BooleanQuery();
  183.         query.add(contentQuery, BooleanClause.Occur.MUST);
  184.         hits = searcher.search(query, null, 100).scoreDocs;
  185.         Assert.assertEquals("Test 2: (will fail using GermanAnalyzer but should not) result count", 1,
  186.             hits.length);
  187.     }
  188.  
  189.     /**
  190.      * This works **not** as expected using the {@link GermanAnalyzer} and the {@link SimpleAnalyzer}.
  191.      *
  192.      * We debugged this and it seems the string "Hersener" will be stemmed during indexing to "hers".
  193.      * The string "Barbara" will not be stemmed during indexing.
  194.      * The searcher does not stem the given search string when a star is added.
  195.  
  196.      * The query "Barb*" will find the indexed string "Barbara" as expected.
  197.      * The query "Hers*" will find the indexed string "Hersener" as expected.
  198.      * The query "Barba*" will find the indexed string "Barbara" as expected.
  199.      * The query "Herse*" will **not** find the indexed string "Hersener" as expected.
  200.      * So Test 4 will fail.
  201.      *
  202.      * @param analyzer          The analyzer used to create the index
  203.      * @param searcher          The IndexSearcher to get the indexed data
  204.      * @throws ParseException
  205.      * @throws IOException
  206.      */
  207.     private void checkUsingPartialMatchAndStar(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
  208.         throws ParseException, IOException
  209.     {
  210.         QueryParser parser = TestLuceneIndex.getParser(analyzer);
  211.         Query contentQuery = parser.parse(TestLuceneIndex.BARBARA.substring(0, 4) + "*");
  212.         BooleanQuery query = new BooleanQuery();
  213.         query.add(contentQuery, BooleanClause.Occur.MUST);
  214.         ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
  215.         Assert.assertEquals("Test 1: result count", 1, hits.length);
  216.  
  217.         contentQuery = parser.parse(TestLuceneIndex.BARBARA.substring(0, 5) + "*");
  218.         query = new BooleanQuery();
  219.         query.add(contentQuery, BooleanClause.Occur.MUST);
  220.         hits = searcher.search(query, null, 100).scoreDocs;
  221.         Assert.assertEquals("Test 2: result count", 1, hits.length);
  222.  
  223.         contentQuery = parser.parse(TestLuceneIndex.HERSENER.substring(0, 4) + "*");
  224.         query = new BooleanQuery();
  225.         query.add(contentQuery, BooleanClause.Occur.MUST);
  226.         hits = searcher.search(query, null, 100).scoreDocs;
  227.         Assert.assertEquals("Test 3: result count", 1, hits.length);
  228.  
  229.         contentQuery =
  230.  parser.parse(TestLuceneIndex.HERSENER.substring(
  231.                 0, 5) + "*");
  232.         query = new BooleanQuery();
  233.         query.add(contentQuery, BooleanClause.Occur.MUST);
  234.         hits = searcher.search(query, null, 100).scoreDocs;
  235.         Assert.assertEquals("Test 4: (will fail using GermanAnalyzer but should not) result count", 1,
  236.             hits.length);
  237.     }
  238.  
  239.     /**
  240.      * This works **not** as expected using the {@link EnglishAnalyzer}.
  241.      *
  242.      * It seems the string "families" will be stemmed during indexing to "famili".
  243.      * The searcher does not stem the given search string when a star is added, so the string is not found.
  244.  
  245.      * The query "families" will find the indexed string "families" as expected.
  246.      * The query "famil*" will find the indexed string "families" as expected.
  247.      * The query "famili*" will find the indexed string "families" as expected.
  248.      * The query "familie*" will **not** find the indexed string "families" as expected when using the {@link EnglishAnalyzer}.
  249.      * The query "families*" will **not** find the indexed string "families" as expected when using the {@link EnglishAnalyzer}.
  250.      * So Test 16 will fail.
  251.      *
  252.      * @param analyzer          The analyzer used to create the index
  253.      * @param searcher          The IndexSearcher to get the indexed data
  254.      * @throws ParseException
  255.      * @throws IOException
  256.      */
  257.     private void checkSimpleAnalyzerOnFamilies(PerFieldAnalyzerWrapper analyzer, IndexSearcher searcher)
  258.         throws ParseException, IOException
  259.     {
  260.         QueryParser parser = TestLuceneIndex.getParser(analyzer);
  261.         Query contentQuery = parser.parse("families");
  262.         BooleanQuery query = new BooleanQuery();
  263.         query.add(contentQuery, BooleanClause.Occur.MUST);
  264.         ScoreDoc[] hits = searcher.search(query, null, 100).scoreDocs;
  265.         Assert.assertEquals("Test 1: result count", 1, hits.length);
  266.  
  267.         contentQuery = parser.parse("famil*");
  268.         query = new BooleanQuery();
  269.         query.add(contentQuery, BooleanClause.Occur.MUST);
  270.         hits = searcher.search(query, null, 100).scoreDocs;
  271.         Assert.assertEquals("Test 2: result count", 1, hits.length);
  272.  
  273.         contentQuery = parser.parse("famili*");
  274.         query = new BooleanQuery();
  275.         query.add(contentQuery, BooleanClause.Occur.MUST);
  276.         hits = searcher.search(query, null, 100).scoreDocs;
  277.         Assert.assertEquals("Test 3: result count", 1, hits.length);
  278.  
  279.         contentQuery = parser.parse("familie*");
  280.         query = new BooleanQuery();
  281.         query.add(contentQuery, BooleanClause.Occur.MUST);
  282.         hits = searcher.search(query, null, 100).scoreDocs;
  283.         Assert.assertEquals("Test 4: (will fail using EnglishAnalyzer but should not) result count", 1,
  284.             hits.length);
  285.  
  286.         contentQuery = parser.parse("families*");
  287.         query = new BooleanQuery();
  288.         query.add(contentQuery, BooleanClause.Occur.MUST);
  289.         hits = searcher.search(query, null, 100).scoreDocs;
  290.         Assert.assertEquals("Test 5: (will fail using EnglishAnalyzer but should not) result count", 1,
  291.             hits.length);
  292.  
  293.     }
  294.  
  295.     private static Directory getAndFillIndex(PerFieldAnalyzerWrapper analyzer) throws IOException
  296.     {
  297.         IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
  298.         Directory directory = new RAMDirectory();
  299.         IndexWriter writer = new IndexWriter(directory, config);
  300.         Document doc;
  301.         doc =
  302.             TestLuceneIndex.getDocument("Held " + TestLuceneIndex.BARBARA, "Erledigt", "S_ACTSTATUS",
  303.                 TestLuceneIndex.PK_BARBARA);
  304.         writer.addDocument(doc);
  305.         doc = TestLuceneIndex.getDocument("Hero families", "Erledigt", "S_ACTSTATUS", "19j3bot15djhmwtf");
  306.         writer.addDocument(doc);
  307.         doc =
  308.             TestLuceneIndex.getDocument(TestLuceneIndex.HERSENER, "Nicht begonnen / offen", "S_ACTSTATUS",
  309.                 TestLuceneIndex.PK_HERSENER);
  310.         writer.addDocument(doc);
  311.         doc = TestLuceneIndex.getDocument("Aguaplan", "Erledigt", "S_ACTSTATUS", "wzrt3bot15djhqdtbh");
  312.         writer.addDocument(doc);
  313.         writer.close();
  314.         return directory;
  315.     }
  316.  
  317.     private static Document getDocument(String content, String keytableContent, String keyrange,
  318.         String pkSimple)
  319.     {
  320.         Document doc;
  321.         doc = new Document();
  322.         doc.add(new TextField("CONTENT", content, Field.Store.NO));
  323.         doc.add(new TextField("KEYTABLE_CONTENT", keytableContent, Field.Store.YES));
  324.         doc.add(new StringField("KEYRANGE", keyrange, Field.Store.YES));
  325.         doc.add(new StringField("PKSIMPLE", pkSimple, Field.Store.YES));
  326.         return doc;
  327.     }
  328.  
  329.     /**
  330.      * @param usedAnalyzer {@code true}: The GermanAnalyzer will be used as default analyzer and in the "CONTENT" field
  331.      * @return A wrapper containing the analyzers to be used.
  332.      */
  333.     private static PerFieldAnalyzerWrapper getAnalyzer(int usedAnalyzer)
  334.     {
  335.         Analyzer sim = new SimpleAnalyzer(Version.LUCENE_40);
  336.         Analyzer ger;
  337.         switch (usedAnalyzer)
  338.         {
  339.             case 1 :
  340.                 ger = new GermanAnalyzer(Version.LUCENE_40, GermanAnalyzer.getDefaultStopSet());
  341.                 break;
  342.  
  343.             case 2 :
  344.                 ger = new EnglishAnalyzer(Version.LUCENE_40, EnglishAnalyzer.getDefaultStopSet());
  345.                 break;
  346.             case 3 :
  347.             default :
  348.                 ger = sim;
  349.                 break;
  350.         }
  351.  
  352.         Analyzer key = new KeywordAnalyzer();
  353.  
  354.         Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
  355.         fieldAnalyzers.put("KEYTABLE_CONTENT", sim);
  356.         fieldAnalyzers.put("KEYRANGE", key);
  357.         fieldAnalyzers.put("PKSIMPLE", key);
  358.         fieldAnalyzers.put("CONTENT", ger);
  359.  
  360.         return new PerFieldAnalyzerWrapper(ger, fieldAnalyzers);
  361.     }
  362.  
  363. }
Advertisement
Add Comment
Please, Sign In to add comment