Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?xml version="1.0" encoding="UTF-8" ?>
- <schema name="nutch" version="1.4">
- <types>
- <!-- string: a solr.StrField is never analyzed; values are indexed/stored verbatim. -->
- <fieldType name="string" class="solr.StrField" omitNorms="true" sortMissingLast="true"/>
- <!--
- Plain numeric types. precisionStep="0" means each value is indexed at a single
- precision level; for faster range queries use tint/tfloat/tlong/tdouble instead.
- -->
- <fieldType name="int" class="solr.TrieIntField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
- <fieldType name="float" class="solr.TrieFloatField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
- <fieldType name="long" class="solr.TrieLongField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
- <fieldType name="double" class="solr.TrieDoubleField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
- <!--
- Trie numeric types: each value is indexed at several levels of precision,
- which accelerates range queries when many values lie between the range
- endpoints. See the javadoc for NumericRangeQuery for internal
- implementation details.
- precisionStep is specified in bits: smaller values produce more tokens per
- value, a slightly larger index, and faster range queries. A precisionStep
- of 0 disables indexing at different precision levels.
- tlongs and tdoubles are the multiValued variants; they do not set omitNorms.
- -->
- <fieldType name="tint" class="solr.TrieIntField" omitNorms="true" positionIncrementGap="0" precisionStep="8"/>
- <fieldType name="tfloat" class="solr.TrieFloatField" omitNorms="true" positionIncrementGap="0" precisionStep="8"/>
- <fieldType name="tlong" class="solr.TrieLongField" omitNorms="true" positionIncrementGap="0" precisionStep="8"/>
- <fieldType name="tlongs" class="solr.TrieLongField" multiValued="true" positionIncrementGap="0" precisionStep="8"/>
- <fieldType name="tdouble" class="solr.TrieDoubleField" omitNorms="true" positionIncrementGap="0" precisionStep="8"/>
- <fieldType name="tdoubles" class="solr.TrieDoubleField" multiValued="true" positionIncrementGap="0" precisionStep="8"/>
- <!--
- date: values are written as 1995-12-31T23:59:59Z, a more restricted form of
- the canonical dateTime representation described at
- http://www.w3.org/TR/xmlschema-2/#dateTime
- The trailing "Z" (UTC) is mandatory, as are all other components except
- fractional seconds, which are optional: 1995-12-31T23:59:59.999Z
- A value may also be an expression evaluated relative to "NOW", e.g.
- NOW/HOUR ... rounds to the start of the current hour
- NOW-1DAY ... exactly 1 day prior to now
- NOW/DAY+6MONTHS+3DAYS ... 6 months and 3 days in the future from the
- start of the current day
- Consult the DateField javadocs for more information.
- Note: for faster range queries, consider the tdate type.
- -->
- <fieldType name="date" class="solr.TrieDateField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
- <!-- tdate: Trie-based date type for faster date range queries and date faceting. -->
- <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" positionIncrementGap="0" precisionStep="6"/>
- <!-- solr.TextField allows the specification of custom text analyzers
- specified as a tokenizer and a list of token filters. Different
- analyzers may be specified for indexing and querying.
- The optional positionIncrementGap puts space between multiple fields of
- this type on the same document, with the purpose of preventing false phrase
- matching across fields.
- For more info on customizing your analyzer chain, please see
- http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
- -->
- <!--
- text_general: generic cross-language text. Both analyzers tokenize with
- StandardTokenizer, drop the stop words listed in "stopwords.txt"
- (matched case-insensitively), and lowercase the tokens. Synonyms from
- "synonyms.txt" are applied at query time only.
- -->
- <fieldType name="text_general" class="solr.TextField" multiValued="true" positionIncrementGap="100">
- <analyzer type="index">
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
- <!--
- text_en: English text. Tokenizes with StandardTokenizer, removes English
- stop words (lang/stopwords_en.txt), lowercases, strips possessives,
- protects the words listed in protwords.txt from stemming, and finally
- applies Porter stemming. The query analyzer additionally expands
- synonyms from synonyms.txt right after tokenization.
- -->
- <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
- <analyzer type="index">
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.EnglishPossessiveFilterFactory"/>
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
- <filter class="solr.PorterStemFilterFactory"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.EnglishPossessiveFilterFactory"/>
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
- <filter class="solr.PorterStemFilterFactory"/>
- </analyzer>
- </fieldType>
- <!--
- text: type of the article*, content and title fields declared in this schema.
- HTML markup is stripped from the raw character stream, then the text is
- split on whitespace. No lowercasing, stop-word removal or stemming is
- configured, so matching on fields of this type is case-sensitive.
- The charFilter is declared ahead of the tokenizer because char filters
- operate on the input before it is tokenized; keeping the declaration
- order equal to the execution order avoids misreading the chain.
- -->
- <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
- <analyzer>
- <charFilter class="solr.HTMLStripCharFilterFactory"/>
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- </analyzer>
- </fieldType>
- <!--
- text_en_splitting: English text with aggressive word-splitting and
- automatic phrase queries enabled. Compared with text_en it tokenizes on
- whitespace and adds WordDelimiterFilter, which splits and matches words
- on case changes, alpha numeric boundaries, and non-alphanumeric chars.
- Some compound-word cases therefore work: the query "wi fi" matches the
- documents "WiFi" and "wi-fi". Other cases still do not match: the query
- "wifi" against the document "wi fi", or the query "wi-fi" against the
- document "wifi".
- -->
- <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
- <analyzer type="index">
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
- <filter class="solr.PorterStemFilterFactory"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
- <filter class="solr.PorterStemFilterFactory"/>
- </analyzer>
- </fieldType>
- <!--
- text_en_splitting_tight: less flexible matching, but fewer false matches.
- Probably not ideal for product names, but may be good for SKUs. Dashes can
- be inserted in the wrong place and the value will still match.
- A single analyzer is used for both indexing and querying.
- -->
- <fieldType name="text_en_splitting_tight" class="solr.TextField" autoGeneratePhraseQueries="true" positionIncrementGap="100">
- <analyzer>
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.SynonymFilterFactory" expand="false" ignoreCase="true" synonyms="synonyms.txt"/>
- <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
- <filter class="solr.WordDelimiterFilterFactory" catenateAll="0" catenateNumbers="1" catenateWords="1" generateNumberParts="0" generateWordParts="0"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
- <filter class="solr.EnglishMinimalStemFilterFactory"/>
- <!-- Removes duplicate tokens that land on the same position, which is sometimes
- possible when WordDelimiterFilter is used in conjunction with stemming. -->
- <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- </analyzer>
- </fieldType>
- <!--
- text_general_rev: like text_general, but the index analyzer also reverses
- the characters of each token (ReversedWildcardFilter) so that leading
- wildcard queries can be answered more efficiently.
- -->
- <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
- <analyzer type="index">
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
- </analyzer>
- <analyzer type="query">
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
- <!--
- phonetic: indexed-only text type. Tokens from StandardTokenizer are passed
- through DoubleMetaphoneFilter with inject="false".
- The element is spelled fieldType to match every other type declaration in this schema.
- -->
- <fieldType name="phonetic" class="solr.TextField" indexed="true" stored="false">
- <analyzer>
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
- </analyzer>
- </fieldType>
- <!--
- payloads: indexed-only, whitespace-tokenized text whose tokens may carry a
- float payload. The element is spelled fieldType to match every other type
- declaration in this schema.
- -->
- <fieldType name="payloads" class="solr.TextField" indexed="true" stored="false">
- <analyzer>
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <!--
- The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
- a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
- Attributes of the DelimitedPayloadTokenFilterFactory :
- "delimiter" - a one character delimiter. Default is | (pipe)
- "encoder" - how to encode the following value into a payload
- float -> org.apache.lucene.analysis.payloads.FloatEncoder,
- integer -> o.a.l.a.p.IntegerEncoder
- identity -> o.a.l.a.p.IdentityEncoder
- Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
- -->
- <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
- </analyzer>
- </fieldType>
- <!-- lowercase: KeywordTokenizer keeps the whole field value as one token, which is then lowercased. -->
- <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
- <analyzer>
- <tokenizer class="solr.KeywordTokenizerFactory"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
- <!-- url: StandardTokenizer output is lowercased and then passed through
- WordDelimiterFilter, which generates word parts and number parts. -->
- <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
- <analyzer>
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateNumberParts="1" generateWordParts="1"/>
- </analyzer>
- </fieldType>
- <!-- text_path: tokenized with PathHierarchyTokenizer only; no token filters are applied. -->
- <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
- <analyzer>
- <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
- </analyzer>
- </fieldType>
- <!-- ignored: fields of this type are by default neither stored nor indexed,
- so any data added to them is ignored outright. The element is spelled
- fieldType to match every other type declaration in this schema. -->
- <fieldType name="ignored" class="solr.StrField" indexed="false" stored="false" multiValued="true"/>
- <!-- Multi-valued and special-purpose types. -->
- <fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
- <!-- NOTE(review): subFieldSuffix="_d" presumably relies on a "*_d" dynamicField, and
- none is declared in this schema; verify before adding a field of type point. -->
- <fieldType name="point" class="solr.PointType" subFieldSuffix="_d" dimension="2"/>
- <fieldType name="random" class="solr.RandomSortField" indexed="true"/>
- <fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true"/>
- <fieldType name="tdates" class="solr.TrieDateField" positionIncrementGap="0" multiValued="true" precisionStep="6"/>
- </types>
- <fields>
- <!-- unique key of every document (see uniqueKey) -->
- <field name="id" type="string" indexed="true" stored="true"/>
- <!-- fields for the xpath filter -->
- <field name="articleTitle" type="text" indexed="true" stored="true"/>
- <field name="articleAuthor" type="text" indexed="true" stored="true"/>
- <field name="articleContent" type="text" indexed="true" stored="true"/>
- <!-- core fields -->
- <field name="segment" type="string" indexed="false" stored="true"/>
- <field name="digest" type="string" indexed="false" stored="true"/>
- <field name="boost" type="float" indexed="false" stored="true"/>
- <!-- index-basic plugin -->
- <field name="host" type="url" indexed="true" stored="false"/>
- <field name="site" type="string" indexed="true" stored="false"/>
- <field name="url" type="url" indexed="true" stored="true" required="true"/>
- <field name="content" type="text" indexed="true" stored="false"/>
- <field name="title" type="text" indexed="true" stored="true"/>
- <field name="cache" type="string" indexed="false" stored="true"/>
- <field name="tstamp" type="date" indexed="false" stored="true"/>
- <!-- index-anchor plugin -->
- <field name="anchor" type="string" indexed="true" stored="true" multiValued="true"/>
- <!-- index-more plugin -->
- <field name="type" type="string" indexed="true" stored="true" multiValued="true"/>
- <field name="contentLength" type="long" indexed="false" stored="true"/>
- <field name="lastModified" type="date" indexed="false" stored="true"/>
- <field name="date" type="date" indexed="true" stored="true"/>
- <!-- languageidentifier plugin -->
- <field name="lang" type="string" indexed="true" stored="true"/>
- <!-- subcollection plugin -->
- <field name="subcollection" type="string" indexed="true" stored="true" multiValued="true"/>
- <!-- feed plugin (tag is also used by microformats-reltag) -->
- <field name="author" type="string" indexed="true" stored="true"/>
- <field name="tag" type="string" indexed="true" stored="true" multiValued="true"/>
- <field name="feed" type="string" indexed="true" stored="true"/>
- <field name="publishedDate" type="date" indexed="true" stored="true"/>
- <field name="updatedDate" type="date" indexed="true" stored="true"/>
- <!-- creativecommons plugin -->
- <field name="cc" type="string" indexed="true" stored="true" multiValued="true"/>
- <field name="_version_" type="long" indexed="true" stored="true"/>
- </fields>
- <!-- Documents are uniquely identified by the "id" field. -->
- <uniqueKey>id</uniqueKey>
- <!-- Queries that do not name a field are searched against articleContent. -->
- <defaultSearchField>articleContent</defaultSearchField>
- <!-- Query terms are combined with OR unless the query specifies an operator.
- NOTE(review): defaultSearchField and solrQueryParser/defaultOperator are reportedly
- deprecated in later Solr releases in favor of the df and q.op request parameters;
- confirm against the Solr version this schema targets. -->
- <solrQueryParser defaultOperator="OR"/>
- </schema>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement