<?xml version="1.0" encoding="ISO-8859-1"?>
<!-- regainConfig -->

<!--
 | Internal entity declarations.
 |
 | "amp" and "lt" are predefined in XML. When a document declares them again,
 | the replacement text has to be a character reference itself (double
 | escaping), so that a reference such as &amp; or &lt; still expands to
 | well-formed content instead of a raw ampersand or less-than sign.
 |
 | "minus" stands for the hyphen character (code point 45). The commented-out
 | regex examples further down use it to spell HTML comment delimiters without
 | placing two adjacent hyphens inside an XML comment.
 +-->
<!DOCTYPE configuration [
  <!ENTITY amp "&#38;#38;">
  <!ENTITY lt "&#38;#60;">
  <!ENTITY minus "&#45;">
]>

<!--
 | This file shows the possibilities of the regain configuration.
 |
 | The real crawler configuration is done in CrawlerConfiguration.xml. This
 | file holds only examples.
 |
 | You can find a detailed description of all configuration tags here:
 | http://regain.murfman.de/wiki/en/index.php/CrawlerConfiguration.xml
 +-->
<configuration>

<!--
 | Proxy settings.
 |
 | All child tags are commented out below, so the <proxy> element is empty in
 | this file. The commented block lists the tags host, port, user and password
 | with example values.
 +-->
<proxy>
  <!--
  <host>proxy</host>
  <port>3128</port>
  <user>HansWurst</user>
  <password>gkxy23</password>
  -->
</proxy>


<!--
 | The user agent the crawler should use for identifying at the HTTP server(s).
 |
 | The value configured here is a browser-style identification string
 | (MSIE 6.0 on Windows NT 5.1).
 +-->
<userAgent>Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)</userAgent>


<!--
 | The list of URLs where the spidering will start.
 | Every <start> entry carries a "parse" and an "index" attribute.
 +-->
<startlist>
  <!-- Directory parsing (example, commented out):
  <start parse="true" index="false">file://c:/Eigene Dateien</start>
  -->

  <!-- HTML parsing -->
  <start parse="true" index="true">http://atow69.amc.aaa.com</start>
</startlist>


<!--
 | The whitelist containing prefixes a URL must have to be processed.
 |
 | Two prefixes are configured: the generic "file://" scheme prefix and the
 | HTTP host that is also used in the start list.
 | NOTE(review): the purpose of the name="file" attribute is not visible in
 | this file; confirm against the regain documentation.
 +-->
<whitelist>
  <prefix name="file">file://</prefix>
  <prefix>http://atow69.amc.aaa.com</prefix>
</whitelist>


<!--
 | The blacklist containing prefixes a URL must NOT have to be processed.
 |
 | The commented-out block shows the two kinds of entries used as examples:
 | a <prefix> entry and a <regex> entry. The active entries exclude three
 | sub-paths of the whitelisted HTTP host.
 +-->
<blacklist>
  <!--
  <prefix>http://www.mydomain.de/some/dynamic/content/</prefix>
  <regex>/backup/[^/]*$</regex>
  -->
  <prefix>http://atow69.amc.aaa.com/cm/</prefix>
  <prefix>http://atow69.amc.aaa.com/webapps/</prefix>
  <prefix>http://atow69.amc.aaa.com/uploadFiles/</prefix>

</blacklist>


<!-- The preferences for the search index -->
<searchIndex>
  <!-- The directory where the index should be located -->
  <dir>searchindex</dir>

  <!-- Specifies, whether the index should be built -->
  <buildIndex>true</buildIndex>

  <!--
   | Specifies the analyzer type to use.
   |
   | You may specify the class name of the analyzer or you use one of the
   | following aliases:
   |  * english: For the english language
   |    (alias for org.apache.lucene.analysis.standard.StandardAnalyzer)
   |  * german: For the german language
   |    (alias for org.apache.lucene.analysis.de.GermanAnalyzer)
   +-->
  <analyzerType>english</analyzerType>

  <!--
   | Sets the maximum number of terms that will be indexed for a single field in
   | a document.
   |
   | If missing or set to -1, then lucene's default will be used (10000).
   +-->
  <maxFieldLength>10000</maxFieldLength>

  <!--
   | Specifies the interval between two breakpoints in minutes. If set to 0, no
   | breakpoints will be created.
   |
   | A breakpoint is a snapshot the crawler creates periodically while working.
   | If the crawler should crash it goes on from the last breakpoint.
   +-->
  <breakpointInterval>10</breakpointInterval>

  <!--
   | Specifies, whether the analysis files should be written.
   | The analysis files help to check the quality of the index building process.
   +-->
  <writeAnalysisFiles>false</writeAnalysisFiles>

  <!--
   | Specifies the maximum percentage of failed documents. (0..100)
   |
   | If the ratio of failed documents to the total number of documents is
   | greater than this percentage, the index is discarded.
   |
   | Failed documents are documents that either do not exist (dead link) or
   | that could not be read.
   +-->
  <maxFailedDocuments>100</maxFailedDocuments>

  <!--
   | Contains all words that should not be indexed.
   | Separate the words by a blank.
   |
   | The two non-ASCII German words are written with character references
   | (&#223; = sharp s, &#252; = u umlaut), so their value does not depend on
   | the byte encoding this file is stored in.
   +-->
  <stopwordList>
    einer eine eines einem einen der die das dass da&#223; du er sie es was wer wie
    wir und oder ohne mit am im in aus auf ist sein war wird ihr ihre ihres als
    f&#252;r von mit dich dir mich mir mein sein kein durch wegen wird
  </stopwordList>
  <!-- italian:
  <stopwordList>
    di a da in con su per tra fra io tu egli ella essa noi voi essi loro che cui
    se e né anche inoltre neanche o ovvero oppure ma però eppure anzi invece
    bensì tuttavia quindi dunque perciò pertanto cioè infatti ossia non come
    mentre perché quando mio mia miei mie tuo tua tuoi tue suo sua suoi sue
    nostro nostre nostri nostre vostro vostre vostri vostre il lo la i gli le un
    uno una degli delle alcuno alcuna alcune qualcuno qualcuna nessuno nessuna
    molto molte molti molte poco parecchio assai
  </stopwordList>
  -->

  <!--
   | Contains all words that should not be changed by an analyzer when indexed.
   | Separate the words by a blank.
   +-->
  <exclusionList></exclusionList>

  <!--
   | The names of the fields of which to prefetch the distinct values.
   | Separate the field names by a blank.
   |
   | Put in the names of the fields you use a search:input_fieldlist tag for.
   | The values shown in the list will then be extracted by the crawler and not
   | by the search mask, which prevents a slow first loading of a page for huge
   | indexes.
   +-->
  <valuePrefetchFields>mimetype</valuePrefetchFields>

</searchIndex>


<!--
 | The preparators in the order they should be applied. Preparators that aren't listed
 | here will be applied after the listed ones.
 |
 | You can use this list...
 |   ... to define the priority (= order) of the preparators
 |   ... to disable preparators
 |   ... to configure preparators
 +-->
<preparatorList>
  <!--
   | HTML preparator. Every example inside its <config> element (urlPattern,
   | contentExtractor, pathExtractor) is commented out, so no extractor is
   | configured in this file.
   +-->
  <preparator>
    <class>.HtmlPreparator</class>
    <!--
     | The regular expression a URL must match to be prepared by this
     | preparator. If specified, the regular expression used internally by the
     | preparator is overridden.
     +-->
    <!--
    <urlPattern>(^http://[^/]*/?$)|(^http://.*/[^\.]*$)|(^http://.*/$)|(\.(html|htm|jsp|php\d?|asp)$)</urlPattern>
    -->
    <config>
      <!--
       | The regular expressions that find the start and end locations of the content
       | that should be indexed in HTML pages.
       |
       | "prefix":        The prefix the content extractor is responsible for.
       |                  You may specify more content extractors.
       | "startRegex":    The regular expression that finds the start of the
       |                  content. When the content should be indexed from the
       |                  start, specify an empty text for "startRegex".
       | "endRegex":      The regular expression that finds the end of the
       |                  content. When the content should be indexed to the
       |                  end, specify an empty text for "endRegex".
       | "headlineRegex": The regular expression that finds the headline.
       | "headlineRegex.group": The group of "headlineRegex" that extracts the
       |                  headline.
       |
       | The example writes the markup characters of the regexes as entity
       | references; "minus" is declared in the DOCTYPE of this file as the
       | hyphen character.
       +-->
      <!--
      <section name="contentExtractor">
        <param name="prefix">http://www.testit.de/</param>
        <param name="startRegex">&lt;!&minus;&minus; Start content &minus;&minus;&gt;</param>
        <param name="endRegex"></param>
        <param name="headlineRegex">&lt;a name=.*&gt;(.*)&lt;/a&gt;</param>
        <param name="headlineRegex.group">1</param>
      </section>
      -->

      <!--
       | The extractor that extracts the navigation path in HTML pages.
       |
       | "prefix":        The prefix the path extractor is responsible for.
       | "startRegex":    The regular expression that finds the start of the area
       |                  where the whole navigation path is.
       | "endRegex":      The regular expression that finds the end of the area where
       |                  the whole navigation path is.
       | "pathNodeRegex": The regular expression that extracts one node of the
       |                  navigation path.
       | "pathNodeRegex.urlGroup": The group of "pathNodeRegex" that extracts
       |                  the URL of the path node.
       | "pathNodeRegex.titleGroup": The group of "pathNodeRegex" that extracts
       |                  the title of the path node.
       +-->
      <!--
      <section name="pathExtractor">
        <param name="prefix">http://www.testit.de/</param>
        <param name="startRegex">&lt;!&minus;&minus; Pfad-Beginn &minus;&minus;&gt;</param>
        <param name="endRegex">&lt;!&minus;&minus; Pfad-Ende &minus;&minus;&gt;</param>
        <param name="pathNodeRegex">&lt;a.*href="([^"]*)">(.*)&lt;/a></param>
        <param name="pathNodeRegex.urlGroup">1</param>
        <param name="pathNodeRegex.titleGroup">2</param>
      </section>
      -->
    </config>
  </preparator>

  <!--
   | Excel preparators. The POI based one is listed without configuration;
   | the Jacob based one carries enabled="false" and a "main" section.
   +-->
  <preparator>
    <class>.PoiMsExcelPreparator</class>
  </preparator>
  <preparator enabled="false">
    <class>.JacobMsExcelPreparator</class>
    <config>
      <!--
       | properties:
       |   The semicolon separated list of document properties that should be
       |   extracted.
       |   Possible properties are:
       |     propTitle, subject, author, keywords, comments, template,
       |     lastAuthor, revision, timeLastPrinted, timeCreated, timeLastSaved,
       |     totalEditTime, pages, words, characters, security, category,
       |     manager, company, bytes, lines, paras, hyperlinkBase, charsWSpaces
       +-->
      <section name="main">
        <param name="properties">author</param>
      </section>
    </config>
  </preparator>

  <preparator>
    <class>.PoiMsWordPreparator</class>
  </preparator>
  <preparator enabled="false">
    <class>.JacobMsWordPreparator</class>
    <config>
      <!--
       | headlineStyles:
       |   The semicolon separated list of Word style names (format templates)
       |   used by headline paragraphs.
       |
       | properties:
       |   The semicolon separated list of document properties that should be
       |   extracted.
       |   Possible properties are:
       |     propTitle, subject, author, keywords, comments, template,
       |     lastAuthor, revision, timeLastPrinted, timeCreated, timeLastSaved,
       |     totalEditTime, pages, words, characters, security, category,
       |     manager, company, bytes, lines, paras, hyperlinkBase, charsWSpaces
       +-->
      <section name="main">
        <param name="headlineStyles">�berschrift 1;�berschrift 2</param>
        <param name="properties">author</param>
      </section>
    </config>
  </preparator>

  <preparator enabled="false">
    <class>.JacobMsPowerPointPreparator</class>
    <config>
      -->
      <!--
       | properties:
       |   The semicolon separated list of document properties that should be
       |   extracted.
       |   Possible properties are:
       |     propTitle, subject, author, keywords, comments, template,
       |     lastAuthor, revision, timeLastPrinted, timeCreated, timeLastSaved,
       |     totalEditTime, pages, words, characters, security, category,
       |     manager, company, bytes, lines, paras, slides, notes, hiddenSlides,
       |     mmClips, hyperlinkBase, charsWSpaces
       +-->
      <section name="main">
        <param name="properties">author</param>
      </section>
    </config>
  </preparator>

  <preparator enabled="false">
      <class>.IfilterPreparator</class>
  </preparator>

  <!--
   | Add the extensions of those files for which no content-extracting
   | preparator exists to the urlPattern of this preparator. regain will then
   | add at least the file names and paths to the index.
   |
   | The pattern configured here covers the extensions mp3 and wav.
   +-->
  <preparator>
    <urlPattern>\.(mp3|wav)$</urlPattern>
    <class>.EmptyPreparator</class>
  </preparator>

  <!--
   | Catch-all preparator on the basis of EmptyPreparator: its urlPattern
   | matches every URL and it carries priority="-10".
   +-->
  <preparator priority="-10">
    <class>.EmptyPreparator</class>
    <urlPattern>.*</urlPattern>
  </preparator>

  <!-- RTF preparators: two implementations are listed, both without configuration. -->
  <preparator>
    <class>.SimpleRtfPreparator</class>
  </preparator>
  <preparator>
    <class>.SwingRtfPreparator</class>
  </preparator>

  <!--
   | External preparator; listed with enabled="false". The single command
   | section below maps URLs ending in ".ps" to the ps2ascii command line.
   +-->
  <preparator enabled="false">
    <class>.ExternalPreparator</class>
    <config>
      <!--
       | You may specify multiple commands by specifying multiple command
       | sections.
       |
       | urlPattern:
       |   The pattern that matches URLs that should be prepared with this
       |   command.
       |
       | commandLine:
       |   The command line to use for executing the external command.
       |   Before the command is executed, ${filename} will be replaced by the
       |   file name.
       |
       | checkExitCode:
       |   Specifies whether the exit code should be checked. Optional. Default
       |   is true.
       +-->
      <section name="command">
        <param name="urlPattern">\.ps$</param>
        <param name="commandLine">ps2ascii ${filename}</param>
        <param name="checkExitCode">false</param>
      </section>
    </config>
  </preparator>
</preparatorList>


<!--
 | The index may be extended with auxiliary fields. These are fields that have
 | been generated from the URL of a document.
 |
 | Example: If you have a directory with a sub directory for every project,
 | then you may create a field with the project's name.
 |
 | The following tag will create a field "project" with the value "otto23"
 | from the URL "file://c:/projects/otto23/docs/Spez.doc":
 |   <auxiliaryField name="project" regexGroup="1">
 |     <regex>^file://c:/projects/([^/]*)</regex>
 |   </auxiliaryField>
 |
 | URLs that don't match will get no "project" field.
 |
 | Having done this you may search for "Offer project:otto23" and you will get
 | only hits from this project directory.
 +-->
<auxiliaryFieldList>
  <auxiliaryField name="extension" regexGroup="1" toLowercase="true">
    <regex>\.([^\.]*)$</regex>
  </auxiliaryField>
  <auxiliaryField name="location" regexGroup="1" store="false" tokenize="true">
    <regex>^(.*)$</regex>
  </auxiliaryField>
  <!--
   | The value would be filled while creating the document in the parser.
   | NOTE(review): the pattern ^()$ can only match an empty string.
   +-->
  <auxiliaryField name="mimetype" regexGroup="1" >
    <regex>^()$</regex>
  </auxiliaryField>
</auxiliaryFieldList>


<!-- Specifies whether to load URLs that are neither parsed nor indexed -->
<loadUnparsedUrls>false</loadUnparsedUrls>


<!--
 | The timeout for HTTP downloads. This value determines the maximum time in
 | seconds that an HTTP download may take in total.
 +-->
<httpTimeout>180</httpTimeout>


<!--
 | The list of patterns a document's URL must match when the link text
 | should be used as title instead of the document's real title.
 |
 | The pattern configured here covers HTTP URLs ending in pdf, xls, doc or rtf.
 +-->
<useLinkTextAsTitleList>
  <urlPattern>^http://.*\.(pdf|xls|doc|rtf)$</urlPattern>
</useLinkTextAsTitleList>


<!--
 | Specifies which control files should be created. These files may be used to
 | check with a scheduling script whether the index was successfully built.
 |
 | <finishedWithoutFatalsFile>: The name of the control file that should be
 | created if the index creation finished without fatal errors.
 |
 | <finishedWithFatalsFile>: The name of the control file that should be
 | created if the index creation failed with a fatal error.
 +-->
<!--
<controlFiles>
  <finishedWithoutFatalsFile>c:\Temp\control\NoFatals</finishedWithoutFatalsFile>
  <finishedWithFatalsFile>c:\Temp\control\WithFatals</finishedWithFatalsFile>
</controlFiles>
-->


<!--
 | The CrawlerAccessController to use.
 |
 | This is a part of the access control system that ensures that only those
 | documents are shown in the search results that the user is allowed to
 | read.
 |
 | If you specify a CrawlerAccessController, don't forget to specify the
 | SearchAccessController counterpart in the SearchConfiguration.xml!
 +-->
<!--
<crawlerAccessController>
  <class jar="myAccess.jar">mypackage.MyCrawlerAccessController</class>
  <config>
    <param name="bla">blubb</param>
  </config>
</crawlerAccessController>
-->


<!-- The regular expressions that identify URLs in HTML. -->
<!-- This configuration part is no longer necessary -->
<!--htmlParserPatternList>
  <pattern parse="true" index="true" regexGroup="1">="([^"]*(/|htm|html|jsp|php\d?|asp))"</pattern>
  <pattern parse="false" index="false" regexGroup="1">="([^"]*\.(js|css|jpg|gif|png))"</pattern>
  <pattern parse="false" index="true" regexGroup="1">="([^"]*\.[^\."]{3})"</pattern>
</htmlParserPatternList-->


<!--
 | Number for cycle detection in URIs. A value greater than 1 means that a URI
 | will be rejected if parts of its path are repeated.
 | Example: file:///usr/sbin/X11/X11/X11/xconfig will be rejected with
 | MaxCycleCount = 2.
 | NOTE(review): the exact counting rule is not stated in this file; confirm
 | against the regain documentation.
 +-->
<MaxCycleCount>1</MaxCycleCount>


<!--
 | Maximum length of the summary in a prepared and indexed document (default
 | 250000). The highlighting of search terms will be created from this summary.
 | The longer the summary, the better the highlighting, but the larger the
 | index. After highlighting, the summary will be cut to 200 characters for
 | downward compatibility of the hit documents.
 +-->
<MaxSummaryLength>1000000</MaxSummaryLength>

</configuration>