以下是针对schema.xml 配置文件的剖析:
1. <types></types>这个标签和它的意义一样,是用来表示数据有哪些类型,这些类型当然是solr内部定义的类型和自定义类型。
2. <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
和他上面解释一样,string类型是不分词的,要建索引,要存储
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
3.数值类型,有如下几个类型是默认数值类型,如果想用于排序请用 tint/tfloat/tlong/tdouble类型
<!--Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
-->
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
4.时间类型:如果想用于快速排序查询,用tdate(看到这里我的排序没用tdate,得改啊。。)
Note: For faster range queries, consider the tdate type
<fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
<!-- A Trie based date field for faster date range queries and date faceting. --> <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
5.专门用于分词的字段。在里面包含了定义使用什么分词器,可以手工定制。
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<!--建索引用到的分词器,我用的是IKTokenizerFactory,中文分词-->
<analyzer type="index">
<!--<tokenizer class="solr.WhitespaceTokenizerFactory"/>--> <tokenizer class="org.wltea.analyzer.solr.IKTokenizerFactory"/> <!-- in this example, we will only use synonyms at query time <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> --> <!-- Case insensitive stop word removal. add enablePositionIncrements=true in both the index and query analyzers to leave a 'gap' for more accurate phrase queries. --> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> <filter class="solr.PorterStemFilterFactory"/> </analyzer><!-- <analyzer type="query">
<!-- <tokenizer class="solr.WhitespaceTokenizerFactory"/>--> <tokenizer class="org.wltea.analyzer.solr.IKTokenizerFactory"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> <filter class="solr.PorterStemFilterFactory"/> </analyzer><!--查询用到的分词器IKTokenizerFactory,为了支持中文查询,这是必须的。-->
<analyzer type="query">
<!-- <tokenizer class="solr.WhitespaceTokenizerFactory"/>--> <tokenizer class="org.wltea.analyzer.solr.IKTokenizerFactory"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> <filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> <filter class="solr.PorterStemFilterFactory"/> </analyzer> </fieldType>
其他几个类别都是不常用的,也是通过分词器来定义不同的类别。和第五个类似。
6.索引字段名称定义。
<fields>
<!-- Valid attributes for fields: name: mandatory - the name for the field type: mandatory - the name of a previously defined type from the <types> section indexed: true if this field should be indexed (searchable or sortable) stored: true if this field should be retrievable multiValued: true if this field may contain multiple values per document omitNorms: (expert) set to true to omit the norms associated with this field (this disables length normalization and index-time boosting for the field, and saves some memory). Only full-text fields or fields that need an index-time boost need norms. termVectors: [false] set to true to store the term vector for a given field. When using MoreLikeThis, fields used for similarity should be stored for best performance. termPositions: Store position information with the term vector. This will increase storage costs. termOffsets: Store offset information with the term vector. This will increase storage costs. default: a value that should be used if no value is specified when adding a document. --> <field name="id" type="string" indexed="true" stored="true" required="true" /> <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/> <!-- <field name="name" type="textgen" indexed="true" stored="true"/>--> <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/> <field name="manu" type="textgen" indexed="true" stored="true" omitNorms="true"/> <field name="cat" type="string" indexed="true" stored="true" multiValued="true"/> <field name="features" type="text" indexed="true" stored="true" multiValued="true"/> <field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" /> <field name="weight" type="float" indexed="true" stored="true"/> <!-- <field name="price" type="float" indexed="true" stored="true"/>--> <field name="popularity" type="int" indexed="true" stored="true" /> <field name="inStock" type="boolean" indexed="true" stored="true" /> </fields>
id:是索引字段的唯一标识。
termVectors="true"属性主要用于相关搜索。
multiValued="true"属性,一般用于多个字段组成一个字段的情况。
<field name="content_type" type="text" indexed="true" stored="true" multiValued="true"/>
一般用于查询的字段定义为multiValued。
7. <dynamicField name="*_i" type="int" indexed="true" stored="true"/>表示动态字段,暂时没用到。