Advertisement
Guest User

regainConfig

a guest
Mar 12th, 2009
211
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
XML 16.94 KB | None | 0 0
  1. <?xml version="1.0" encoding="ISO-8859-1"?>
  2.  
  3. <!DOCTYPE configuration [
  4.   <!ENTITY amp "&#38;#38;">  <!-- predefined entity: XML 1.0 section 4.6 requires this double-escaped form -->
  5.   <!ENTITY lt "&#38;#60;">   <!-- same rule; "&#x3C;" would make the replacement text a bare less-than sign -->
  6.   <!ENTITY minus "&#45;">    <!-- a hyphen; the commented-out regex examples use it to spell comment delimiters -->
  7. ]>
  8.  
  9. <!--
  10. | This file shows the possibilities of the regain configuration.
  11. |
  12. | The real crawler configuration is done in CrawlerConfiguration.xml. This file
  13. | holds only examples.
  14. |
  15. | You can find a detailed description of all configuration tags here:
  16. | http://regain.murfman.de/wiki/en/index.php/CrawlerConfiguration.xml
  17. +-->
  18. <configuration>
  19.  
  20. <!-- Proxy settings. The example host, port, user and password below are commented out, so the proxy element currently has no child elements. -->
  21. <proxy>
  22.   <!--
  23.  <host>proxy</host>
  24.  <port>3128</port>
  25.  <user>HansWurst</user>
  26.  <password>gkxy23</password>
  27.  -->
  28. </proxy>
  29.  
  30.  
  31. <!--
  32. | The user agent the crawler should use for identifying at the HTTP server(s).
  33. +-->
  34. <userAgent>Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)</userAgent>
  35.  
  36.  
  37. <!-- The list of URLs where the spidering will start. -->
  38. <startlist>
  39.   <!-- Directory parsing -->
  40.   <!--
  41.  <start parse="true" index="false">file://c:/Eigene Dateien</start>
  42.  -->
  43.  
  44.   <!-- HTML parsing -->
  45.    <start parse="true" index="true">http://atow69.amc.aaa.com</start>
  46.    
  47. </startlist>
  48.  
  49.  
  50. <!-- The whitelist containing prefixes an URL must have to be processed -->
  51. <whitelist>
  52.   <prefix name="file">file://</prefix>
  53.   <prefix>http://atow69.amc.aaa.com</prefix>
  54. </whitelist>
  55.  
  56.  
  57. <!-- The blacklist containing prefixes an URL must NOT have to be processed -->
  58. <blacklist>
  59.   <!--
  60.  <prefix>http://www.mydomain.de/some/dynamic/content/</prefix>
  61.  <regex>/backup/[^/]*$</regex>
  62.  -->
  63.   <prefix>http://atow69.amc.aaa.com/cm/</prefix>  
  64.   <prefix>http://atow69.amc.aaa.com/webapps/</prefix>
  65.   <prefix>http://atow69.amc.aaa.com/uploadFiles/</prefix>
  66.  
  67. </blacklist>
  68.  
  69.  
  70. <!-- The preferences for the search index -->
  71. <searchIndex>
  72.   <!-- The directory where the index should be located -->
  73.   <dir>searchindex</dir>
  74.  
  75.   <!-- Specifies, whether the index should be built -->
  76.   <buildIndex>true</buildIndex>
  77.  
  78.   <!--
  79.   | Specifies the analyzer type to use.
  80.   |
  81.   | You may specify the class name of the analyzer or you use one of the
  82.   | following aliases:
  83.   |  * english: For the english language
  84.   |    (alias for org.apache.lucene.analysis.standard.StandardAnalyzer)
  85.   |  * german: For the german language
  86.   |    (alias for org.apache.lucene.analysis.de.GermanAnalyzer)
  87.   +-->
  88.   <analyzerType>english</analyzerType>
  89.  
  90.   <!--
  91.   | Sets the maximum number of terms that will be indexed for a single field in
  92.   | a document.
  93.   |
  94.   | If missing or set to -1, then lucene's default will be used (10000).
  95.   +-->
  96.   <maxFieldLength>10000</maxFieldLength>
  97.  
  98.   <!--
  99.   | Specifies the interval between two breakpoints in minutes. If set to 0, no
  100.   | breakpoints will be created.
  101.   |
  102.   | A breakpoint is a snapshot the crawler creates periodically while working.
  103.   | If the crawler should crash it goes on from the last breakpoint.
  104.   +-->
  105.   <breakpointInterval>10</breakpointInterval>
  106.  
  107.   <!--
  108.   | Specifies, whether the analysis files should be written.
  109.   | The analysis files help to check the quality of the index building process.
  110.   +-->
  111.   <writeAnalysisFiles>false</writeAnalysisFiles>
  112.  
  113.   <!--
  114.   | Specifies the maximum percentage of failed documents. (0..100)
  115.   |
  116.   | If the ratio of failed documents to the total number of documents is
  117.   | greater than this percentage, the index will be discarded.
  118.   |
  119.   | Failed documents are documents that either do not exist (dead link)
  120.   | or that could not be read.
  121.   +-->
  122.   <maxFailedDocuments>100</maxFailedDocuments>
  123.  
  124.   <!--
  125.   | Contains all words that should not be indexed.
  126.   | Separate the words by a blank.
  127.   +-->
  128.   <stopwordList>
  129.     einer eine eines einem einen der die das dass daß du er sie es was wer wie
  130.     wir und oder ohne mit am im in aus auf ist sein war wird ihr ihre ihres als
  131.     für von mit dich dir mich mir mein sein kein durch wegen wird
  132.   </stopwordList>
  133.   <!-- italian:
  134.  <stopwordList>
  135.    di a da in con su per tra fra io tu egli ella essa noi voi essi loro che cui
  136.    se e né anche inoltre neanche o ovvero oppure ma però eppure anzi invece
  137.    bensì tuttavia quindi dunque perciò pertanto cioè infatti ossia non come
  138.    mentre perché quando mio mia miei mie tuo tua tuoi tue suo sua suoi sue
  139.    nostro nostre nostri nostre vostro vostre vostri vostre il lo la i gli le un
  140.    uno una degli delle alcuno alcuna alcune qualcuno qualcuna nessuno nessuna
  141.    molto molte molti molte poco parecchio assai
  142.  </stopwordList>
  143.  -->
  144.  
  145.   <!--
  146.   | Contains all words that should not be changed by an analyser when indexed.
  147.   | Separate the words by a blank.
  148.   +-->
  149.   <exclusionList></exclusionList>
  150.  
  151.   <!--
  152.   | The names of the fields of which to prefetch the distinct values.
  153.   | Separate the field names by a blank.
  154.   |
  155.   | Put in the names of the fields you use a search:input_fieldlist tag for.
  156.   | The values shown in the list will then be extracted by the crawler and not
  157.   | by the search mask, which prevents a slow first loading of a page for huge
  158.   | indexes.
  159.   +-->
  160.   <valuePrefetchFields>mimetype</valuePrefetchFields>
  161.  
  162. </searchIndex>
  163.  
  164.  
  165. <!--
  166. | The preparators in the order they should be applied. Preparators that aren't listed
  167. | here will be applied after the listed ones.
  168. |
  169. | You can use this list...
  170. |   ... to define the priority (= order) of the preparators
  171. |   ... to disable preparators
  172. |   ... to configure preparators
  173. +-->
  174. <preparatorList>
  175.   <preparator>
  176.     <class>.HtmlPreparator</class>
  177.     <!--
  178.     | The regular expression a URL must match to, to be prepared by this
  179.     | preparator. If specified the regular expression used internally by the
  180.     | preparator is overridden.
  181.     +-->
  182.     <!--
  183.    <urlPattern>(^http://[^/]*/?$)|(^http://.*/[^\.]*$)|(^http://.*/$)|(\.(html|htm|jsp|php\d?|asp)$)</urlPattern>
  184.    -->
  185.     <config>
  186.       <!--
  187.       | The regular expressions that find the start and end locations of the content
  188.       | that should be indexed in HTML pages.
  189.       |
  190.       | "prefix":        The prefix the content extractor is responsible for.
  191.       |                  You may specify more content extractors.
  192.       | "startRegex":    The regular expression that finds the start of the
  193.       |                  content. When the content should be indexed from the
  194.       |                  start, specify an empty text for "startRegex".
  195.       | "endRegex":      The regular expression that finds the end of the
  196.       |                  content. When the content should be indexed to the
  197.       |                  end, specify an empty text for "endRegex".
  198.       | "headlineRegex": The regular expression that finds the headline.
  199.       | "headlineRegex.group": The group of "headlineRegex" that extracts the
  200.       |                  headline.
  201.       +-->
  202.       <!--
  203.      <section name="contentExtractor">
  204.        <param name="prefix">http://www.testit.de/</param>
  205.        <param name="startRegex">&lt;!&minus;&minus; Start content &minus;&minus;&gt;</param>
  206.        <param name="endRegex"></param>
  207.        <param name="headlineRegex">&lt;a name=.*&gt;(.*)&lt;/a&gt;</param>
  208.        <param name="headlineRegex.group">1</param>
  209.      </section>
  210.      -->
  211.  
  212.       <!--
  213.       | The extractor that extracts the navigation path in HTML pages.
  214.       |
  215.       | "prefix":        The prefix the path extractor is responsible for.
  216.       | "startRegex":    The regular expression that finds the start of the area
  217.       |                  where the whole navigation path is.
  218.       | "endRegex":      The regular expression that finds the end of the area where
  219.       |                  the whole navigation path is.
  220.       | "pathNodeRegex": The regular expression that extracts one node of the
  221.       |                  navigation path.
  222.       | "pathNodeRegex.urlGroup": The group of "pathNodeRegex" that extracts
  223.       |                  the URL of the path node.
  224.       | "pathNodeRegex.titleGroup": The group of "pathNodeRegex" that extracts
  225.       |                  the title of the path node.
  226.       +-->
  227.       <!--
  228.      <section name="pathExtractor">
  229.        <param name="prefix">http://www.testit.de/</param>
  230.        <param name="startRegex">&lt;!&minus;&minus; Pfad-Beginn &minus;&minus;&gt;</param>
  231.        <param name="endRegex">&lt;!&minus;&minus; Pfad-Ende &minus;&minus;&gt;</param>
  232.        <param name="pathNodeRegex">&lt;a.*href="([^"]*)">(.*)&lt;/a></param>
  233.        <param name="pathNodeRegex.urlGroup">1</param>
  234.        <param name="pathNodeRegex.titleGroup">2</param>
  235.      </section>
  236.      -->
  237.     </config>
  238.   </preparator>
  239.  
  240.   <preparator>
  241.     <class>.PoiMsExcelPreparator</class>
  242.   </preparator>
  243.   <preparator enabled="false">
  244.     <class>.JacobMsExcelPreparator</class>
  245.     <config>
  246.       <!--
  247.       | properties:
  248.       |   The semicolon separated list of document properties that should be
  249.       |   extracted.
  250.       |   Possible properties are:
  251.       |     propTitle, subject, author, keywords, comments, template,
  252.       |     lastAuthor, revision, timeLastPrinted, timeCreated, timeLastSaved,
  253.       |     totalEditTime, pages, words, characters, security, category,
  254.       |     manager, company, bytes, lines, paras, hyperlinkBase, charsWSpaces
  255.       +-->
  256.       <section name="main">
  257.         <param name="properties">author</param>
  258.       </section>
  259.     </config>
  260.   </preparator>
  261.  
  262.   <preparator>
  263.     <class>.PoiMsWordPreparator</class>
  264.   </preparator>
  265.   <preparator enabled="false">
  266.     <class>.JacobMsWordPreparator</class>
  267.     <config>
  268.       <!--
  269.       | headlineStyles:
  270.       |   The semicolon separated list of Word style names (format templates)
  271.       |   used by headline paragraphs.
  272.       |
  273.       | properties:
  274.       |   The semicolon separated list of document properties that should be
  275.       |   extracted.
  276.       |   Possible properties are:
  277.       |     propTitle, subject, author, keywords, comments, template,
  278.       |     lastAuthor, revision, timeLastPrinted, timeCreated, timeLastSaved,
  279.       |     totalEditTime, pages, words, characters, security, category,
  280.       |     manager, company, bytes, lines, paras, hyperlinkBase, charsWSpaces
  281.       +-->
  282.       <section name="main">
  283.         <param name="headlineStyles">Überschrift 1;Überschrift 2</param>
  284.         <param name="properties">author</param>
  285.       </section>
  286.     </config>
  287.   </preparator>
  288.  
  289.   <preparator enabled="false">
  290.     <class>.JacobMsPowerPointPreparator</class>
  291.     <config>
  292.       <!-- Settings for the "main" section of this preparator, which is switched off via enabled="false" above. -->
  293.       <!--
  294.       | properties:
  295.       |   The semicolon separated list of document properties that should be
  296.       |   extracted.
  297.       |   Possible properties are:
  298.       |     propTitle, subject, author, keywords, comments, template,
  299.       |     lastAuthor, revision, timeLastPrinted, timeCreated, timeLastSaved,
  300.       |     totalEditTime, pages, words, characters, security, category,
  301.       |     manager, company, bytes, lines, paras, slides, notes, hiddenSlides,
  302.       |     mmClips, hyperlinkBase, charsWSpaces
  303.       +-->
  304.       <section name="main">
  305.         <param name="properties">author</param>
  306.       </section>
  307.     </config>
  308.   </preparator>
  309.  
  310.   <preparator enabled="false">
  311.       <class>.IfilterPreparator</class>
  312.   </preparator>
  313.  
  314.   <!--
  315.   | Add the extensions of those files to the urlPattern of this preparator for
  316.   | which there are no preparators for extracting the content. regain will then
  317.   | add at least the file names and paths to the index.
  318.   +-->
  319.   <preparator>
  320.     <urlPattern>\.(mp3|wav)$</urlPattern>
  321.     <class>.EmptyPreparator</class>
  322.   </preparator>
  323.  
  324.   <!-- CatchAll-preparator on basis of EmptyPreparator -->
  325.   <preparator priority="-10">
  326.     <class>.EmptyPreparator</class>
  327.     <urlPattern>.*</urlPattern>
  328.   </preparator>
  329.  
  330.   <preparator>
  331.     <class>.SimpleRtfPreparator</class>
  332.   </preparator>
  333.   <preparator>
  334.     <class>.SwingRtfPreparator</class>
  335.   </preparator>
  336.  
  337.   <preparator enabled="false">
  338.     <class>.ExternalPreparator</class>
  339.     <config>
  340.       <!--
  341.       | You may specify multiple commands by specifying multiple command
  342.       | sections.
  343.       |
  344.       | urlPattern:
  345.       |   The pattern that matches URLs that should be prepared with this
  346.       |   command.
  347.       |
  348.       | commandLine:
  349.       |   The command line to use for executing the external command.
  350.       |   Before the command is executed ${filename} will be replaced by the
  351.       |   file name.
  352.       |
  353.       | checkExitCode:
  354.       |   Specifies whether the exit code should be checked. Optional. Default
  355.       |   is true.
  356.       +-->
  357.       <section name="command">
  358.         <param name="urlPattern">\.ps$</param>
  359.         <param name="commandLine">ps2ascii ${filename}</param>
  360.         <param name="checkExitCode">false</param>
  361.       </section>
  362.     </config>
  363.   </preparator>
  364. </preparatorList>
  365.  
  366.  
  367. <!--
  368. | The index may be extended with auxiliary fields. These are fields that have
  369. | been generated from the URL of an document.
  370. |
  371. | Example: If you have a directory with a sub directory for every project,
  372. | then you may create a field with the project's name.
  373. |
  374. | The folling tag will create a field "project" with the value "otto23"
  375. | from the URL "file://c:/projects/otto23/docs/Spez.doc":
  376. |   <auxiliaryField name="project" regexGroup="1">
  377. |     <regex>^file://c:/projects/([^/]*)</regex>
  378. |   </auxiliaryField>
  379. |
  380. | URLs that doen't match will get no "project" field.
  381. |
  382. | Having done this you may search for "Offer project:otto23" and you will get
  383. | only hits from this project directory.
  384. +-->
  385. <auxiliaryFieldList>
  386.   <auxiliaryField name="extension" regexGroup="1" toLowercase="true">
  387.     <regex>\.([^\.]*)$</regex>
  388.   </auxiliaryField>
  389.   <auxiliaryField name="location" regexGroup="1" store="false" tokenize="true">
  390.     <regex>^(.*)$</regex>
  391.   </auxiliaryField>
  392.   <!-- The value would be filled while creating the document in the parser -->
  393.   <auxiliaryField name="mimetype" regexGroup="1" >
  394.     <regex>^()$</regex>
  395.   </auxiliaryField>
  396. </auxiliaryFieldList>
  397.  
  398.  
  399. <!-- Specifies, whether to load URLs that are neither parsed nor indexed -->
  400. <loadUnparsedUrls>false</loadUnparsedUrls>
  401.  
  402.  
  403. <!--
  404. | Der Timeout für HTTP-Downloads. Dieser Wert bestimmt die maximale Zeit
  405. | in Sekunden, die ein HTTP-Download insgesamt dauern darf.
  406. +-->
  407. <httpTimeout>180</httpTimeout>
  408.  
  409.  
  410. <!--
  411. | The list of patterns a document's URL must match to, when the link text
  412. | should be used as title instead of the document's real title.
  413. +-->
  414. <useLinkTextAsTitleList>
  415.   <urlPattern>^http://.*\.(pdf|xls|doc|rtf)$</urlPattern>
  416. </useLinkTextAsTitleList>
  417.  
  418.  
  419. <!--
  420. | Specifies which control files should be crated. These files may be used to
  421. | check with a sceduling script whether the index was successfully built.
  422. |
  423. | <finishedWithoutFatalsFile>: The name of the control file that should be
  424. | created if the index creation finished without fatal errors.
  425. |
  426. | <finishedWithFatalsFile>: The name of the control file that should be
  427. | created if the index creation failed with a fatal error.
  428. +-->
  429. <!--
  430. <controlFiles>
  431.  <finishedWithoutFatalsFile>c:\Temp\control\NoFatals</finishedWithoutFatalsFile>
  432.  <finishedWithFatalsFile>c:\Temp\control\WithFatals</finishedWithFatalsFile>
  433. </controlFiles>
  434. -->
  435.  
  436.  
  437. <!--
  438. | The CrawlerAccessController to use.
  439. |
  440. | This is a part of the access control system that ensures that only those
  441. | documents are shown in the search results that the user is allowed to
  442. | read.
  443. |
  444. | If you specify a CrawlerAccessController, don't forget to specify the
  445. | SearchAccessController counterpart in the SearchConfiguration.xml!
  446. +-->
  447. <!--
  448. <crawlerAccessController>
  449.  <class jar="myAccess.jar">mypackage.MyCrawlerAccessController</class>
  450.  <config>
  451.    <param name="bla">blubb</param>
  452.  </config>
  453. </crawlerAccessController>
  454. -->
  455.  
  456.  
  457. <!-- The regular expressions that indentify URLs in HTML. -->
  458. <!-- This configuration part is no longer neccessary -->
  459. <!--htmlParserPatternList>
  460.  <pattern parse="true" index="true" regexGroup="1">="([^"]*(/|htm|html|jsp|php\d?|asp))"</pattern>
  461.  <pattern parse="false" index="false" regexGroup="1">="([^"]*\.(js|css|jpg|gif|png))"</pattern>
  462.  <pattern parse="false" index="true" regexGroup="1">="([^"]*\.[^\."]{3})"</pattern>
  463. </htmlParserPatternList-->
  464.  
  465.  
  466. <!--
  467. | Number for cycle detection in URIs. Values greater than 1 defines that an URI will be rejected
  468. | if less or more parts in the path of an URI are equal. Example file:///usr/sbin/X11/X11/X11/xconfig
  469. | will be rejected with MaxCycleCount = 2
  470. -->
  471. <MaxCycleCount>1</MaxCycleCount>
  472.  
  473.  
  474. <!--
  475. | Maximum length of summary in an prepared and indexed document (default 250000).
  476. | The highlighting of search terms will be created from this summary. The longer
  477. | the summary, the better the highlighting but also the size of the index. After
  478. | highlighting the summary will be cut to 200 characters for downward compatibility
  479. | of the hit documents.
  480. -->
  481. <MaxSummaryLength>1000000</MaxSummaryLength>
  482.  
  483. </configuration>
  484.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement
  309.  
  310.   <preparator enabled="false">
  311.       <class>.IfilterPreparator</class>
  312.   </preparator>
  313.  
  314.   <!--
  315.   | Add the extensions of those files to the urlPattern of this preparator for
  316.   | which there are no preparators for extracting the content. regain will then
  317.   | add at least the file names and paths to the index.
  318.   +-->
  319.   <preparator>
  320.     <urlPattern>\.(mp3|wav)$</urlPattern>
  321.     <class>.EmptyPreparator</class>
  322.   </preparator>
  323.  
  324.   <!-- CatchAll-preparator on basis of EmptyPreparator -->
  325.   <preparator priority="-10">
  326.     <class>.EmptyPreparator</class>
  327.     <urlPattern>.*</urlPattern>
  328.   </preparator>
  329.  
  330.   <preparator>
  331.     <class>.SimpleRtfPreparator</class>
  332.   </preparator>
  333.   <preparator>
  334.     <class>.SwingRtfPreparator</class>
  335.   </preparator>
  336.  
  337.   <preparator enabled="false">
  338.     <class>.ExternalPreparator</class>
  339.     <config>
  340.       <!--
  341.       | You may specify multiple commands by specifying multiple command
  342.       | sections.
  343.       |
  344.       | urlPattern:
  345.       |   The pattern that matches URLs that should be prepared with this
  346.       |   command.
  347.       |
  348.       | commandLine:
  349.       |   The command line to use for executing the external command.
  350.       |   Before the command is executed ${filename} will be replaced by the
  351.       |   file name.
  352.       |
  353.       | checkExitCode:
  354.       |   Specifies whether the exit code should be checked. Optional. Default
  355.       |   is true.
  356.       +-->
  357.       <section name="command">
  358.         <param name="urlPattern">\.ps$</param>
  359.         <param name="commandLine">ps2ascii ${filename}</param>
  360.         <param name="checkExitCode">false</param>
  361.       </section>
  362.     </config>
  363.   </preparator>
  364. </preparatorList>
  365.  
  366.  
  367. <!--
  368. | The index may be extended with auxiliary fields. These are fields that have
  369. | been generated from the URL of an document.
  370. |
  371. | Example: If you have a directory with a sub directory for every project,
  372. | then you may create a field with the project's name.
  373. |
  374. | The folling tag will create a field "project" with the value "otto23"
  375. | from the URL "file://c:/projects/otto23/docs/Spez.doc":
  376. |   <auxiliaryField name="project" regexGroup="1">
  377. |     <regex>^file://c:/projects/([^/]*)</regex>
  378. |   </auxiliaryField>
  379. |
  380. | URLs that doen't match will get no "project" field.
  381. |
  382. | Having done this you may search for "Offer project:otto23" and you will get
  383. | only hits from this project directory.
  384. +-->
  385. <auxiliaryFieldList>
  386.   <auxiliaryField name="extension" regexGroup="1" toLowercase="true">
  387.     <regex>\.([^\.]*)$</regex>
  388.   </auxiliaryField>
  389.   <auxiliaryField name="location" regexGroup="1" store="false" tokenize="true">
  390.     <regex>^(.*)$</regex>
  391.   </auxiliaryField>
  392.   <!-- The value would be filled while creating the document in the parser -->
  393.   <auxiliaryField name="mimetype" regexGroup="1" >
  394.     <regex>^()$</regex>
  395.   </auxiliaryField>
  396. </auxiliaryFieldList>
  397.  
  398.  
  399. <!-- Specifies, whether to load URLs that are neither parsed nor indexed -->
  400. <loadUnparsedUrls>false</loadUnparsedUrls>
  401.  
  402.  
  403. <!--
  404. | Der Timeout für HTTP-Downloads. Dieser Wert bestimmt die maximale Zeit
  405. | in Sekunden, die ein HTTP-Download insgesamt dauern darf.
  406. +-->
  407. <httpTimeout>180</httpTimeout>
  408.  
  409.  
  410. <!--
  411. | The list of patterns a document's URL must match to, when the link text
  412. | should be used as title instead of the document's real title.
  413. +-->
  414. <useLinkTextAsTitleList>
  415.   <urlPattern>^http://.*\.(pdf|xls|doc|rtf)$</urlPattern>
  416. </useLinkTextAsTitleList>
  417.  
  418.  
  419. <!--
  420. | Specifies which control files should be crated. These files may be used to
  421. | check with a sceduling script whether the index was successfully built.
  422. |
  423. | <finishedWithoutFatalsFile>: The name of the control file that should be
  424. | created if the index creation finished without fatal errors.
  425. |
  426. | <finishedWithFatalsFile>: The name of the control file that should be
  427. | created if the index creation failed with a fatal error.
  428. +-->
  429. <!--
  430. <controlFiles>
  431.  <finishedWithoutFatalsFile>c:\Temp\control\NoFatals</finishedWithoutFatalsFile>
  432.  <finishedWithFatalsFile>c:\Temp\control\WithFatals</finishedWithFatalsFile>
  433. </controlFiles>
  434. -->
  435.  
  436.  
  437. <!--
  438. | The CrawlerAccessController to use.
  439. |
  440. | This is a part of the access control system that ensures that only those
  441. | documents are shown in the search results that the user is allowed to
  442. | read.
  443. |
  444. | If you specify a CrawlerAccessController, don't forget to specify the
  445. | SearchAccessController counterpart in the SearchConfiguration.xml!
  446. +-->
  447. <!--
  448. <crawlerAccessController>
  449.  <class jar="myAccess.jar">mypackage.MyCrawlerAccessController</class>
  450.  <config>
  451.    <param name="bla">blubb</param>
  452.  </config>
  453. </crawlerAccessController>
  454. -->
  455.  
  456.  
  457. <!-- The regular expressions that indentify URLs in HTML. -->
  458. <!-- This configuration part is no longer neccessary -->
  459. <!--htmlParserPatternList>
  460.  <pattern parse="true" index="true" regexGroup="1">="([^"]*(/|htm|html|jsp|php\d?|asp))"</pattern>
  461.  <pattern parse="false" index="false" regexGroup="1">="([^"]*\.(js|css|jpg|gif|png))"</pattern>
  462.  <pattern parse="false" index="true" regexGroup="1">="([^"]*\.[^\."]{3})"</pattern>
  463. </htmlParserPatternList-->
  464.  
  465.  
  466. <!--
  467. | Number for cycle detection in URIs. Values greater than 1 defines that an URI will be rejected
  468. | if less or more parts in the path of an URI are equal. Example file:///usr/sbin/X11/X11/X11/xconfig
  469. | will be rejected with MaxCycleCount = 2
  470. -->
  471. <MaxCycleCount>1</MaxCycleCount>
  472.  
  473.  
  474. <!--
  475. | Maximum length of summary in an prepared and indexed document (default 250000).
  476. | The highlighting of search terms will be created from this summary. The longer
  477. | the summary, the better the highlighting but also the size of the index. After
  478. | highlighting the summary will be cut to 200 characters for downward compatibility
  479. | of the hit documents.
  480. -->
  481. <MaxSummaryLength>1000000</MaxSummaryLength>
  482.  
  483. </configuration>
  484.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement