首页 \ 教程 \ solr

知识点

Solr

lucene的使用

solr4.7配置（ik-analyzer）

IK Analyzer 中文分词器

初识Lucene

Lucene vs Solr

Solr学习(2) Solr4.2.0+IK Analyzer 2012

基于lucene的全文搜索，才开始搞lucene，好像还有个基于lucene的solr

[置顶] Solr学习之十二：IK Analyzer中文分词配置

Lucene查询语法

Lucene/Solr开发经验

【转载】基于lucene的搜索方案

Scaling Lucene and Solr

理解Lucene/Solr的缓存

在solr4.x上使用IK分词设置useSmart没有效果的解决方案

lucene开发序幕曲之luke神器

基于Lucene 4.x的ik-analyzer

2019-03-27 01:03|来源: 网路

要让 IK Analyzer 适配 Lucene 4.x，需要修改以下三个文件：IKAnalyzer.java、IKTokenizer.java、IKTokenizerFactory.java。

1 import java.io.Reader;
2 import org.apache.lucene.analysis.Analyzer;
3 import org.apache.lucene.analysis.Tokenizer;
4
5 /**
6 * 实现Lucene Analyzer 基于IKTokenizer的中文分词器
7 *
8 * @author 林良益
9 *
10 */
11 public final class IKAnalyzer extends Analyzer {
12
13      private boolean isMaxWordLength = false;
14
15      /**
16      * IK分词器Lucene Analyzer接口实现类默认最细粒度切分算法
17       */
18      public IKAnalyzer() {
19          this( false);
20     }
21
22      /**
23      * IK分词器Lucene Analyzer接口实现类
24      *
25      * @param isMaxWordLength
26      *            当为true时，分词器进行最大词长切分
27       */
28      public IKAnalyzer( boolean isMaxWordLength) {
29          super();
30          this.setMaxWordLength(isMaxWordLength);
31     }
32
33     @Override
34      public TokenStreamComponents createComponents(String fieldName,
35             Reader reader) {
36         Tokenizer tokenizer = new IKTokenizer(reader, isMaxWordLength());
37          return new TokenStreamComponents(tokenizer, null);
38     }
39
40      public void setMaxWordLength( boolean isMaxWordLength) {
41          this.isMaxWordLength = isMaxWordLength;
42     }
43
44      public boolean isMaxWordLength() {
45          return isMaxWordLength;
46     }
47
48 }

1 import java.io.IOException;
2 import java.io.Reader;
3
4 import org.apache.lucene.analysis.Tokenizer;
5 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
6 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
7 import org.wltea.analyzer.IKSegmentation;
8 import org.wltea.analyzer.Lexeme;
9
10 /**
11 * IK Analyzer v3.2 Lucene4.x Tokenizer适配器类它封装了IKSegmentation实现
12 *
13 * @author 林良益
14 *
15 */
16 public final class IKTokenizer extends Tokenizer {
17      // IK分词器实现
18      private IKSegmentation _IKImplement;
19      // 词元文本属性
20      private CharTermAttribute termAtt;
21      // 词元位移属性
22      private OffsetAttribute offsetAtt;
23      // 记录最后一个词元的结束位置
24      private int finalOffset;
25
26      /**
27      * Lucene Tokenizer适配器类构造函数
28      *
29      * @param in
30      * @param isMaxWordLength
31      *            当为true时，分词器进行最大词长切分；当为false是，采用最细粒度切分
32       */
33      public IKTokenizer(Reader in, boolean isMaxWordLength) {
34          super(in);
35         offsetAtt = addAttribute(OffsetAttribute. class);
36         termAtt = addAttribute(CharTermAttribute. class);
37         _IKImplement = new IKSegmentation(in, isMaxWordLength);
38     }
39
40     @Override
41      public final boolean incrementToken() throws IOException {
42          // 清除所有的词元属性
43         clearAttributes();
44         Lexeme nextLexeme = _IKImplement.next();
45          if (nextLexeme != null) {
46              // 将Lexeme转成Attributes
47              // 设置词元文本
48             termAtt.setEmpty().append(nextLexeme.getLexemeText());
49              // 设置词元位移
50             offsetAtt.setOffset(nextLexeme.getBeginPosition(),
51                     nextLexeme.getEndPosition());
52             offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
53             finalOffset = nextLexeme.getEndPosition();
54              // 返会true告知还有下个词元
55              return true;
56         }
57          // 返会false告知词元输出完毕
58          return false;
59     }
60
61      /*
62      * (non-Javadoc)
63      *
64      * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
65       */
66      public void reset() throws IOException {
67          super.reset();
68         _IKImplement.reset(input);
69     }
70
71     @Override
72      public final void end() {
73         offsetAtt.setOffset(finalOffset, finalOffset);
74     }

75 }

1 import java.io.Reader;
2 import java.util.Map;
3
4 import org.apache.lucene.analysis.Tokenizer;
5 import org.apache.lucene.analysis.util.TokenizerFactory;
6 import org.wltea.analyzer.lucene.IKTokenizer;
7
8 /**
9 * 实现Solr4.x分词器接口
10 * 基于IKTokenizer的实现
11 *
12 * @author 林良益、李良杰
13 *
14 */
15 public final class IKTokenizerFactory extends TokenizerFactory{
16
17      private boolean isMaxWordLength = false;
18
19      /**
20      * IK分词器Solr TokenizerFactory接口实现类
21      * 默认最细粒度切分算法
22       */
23      public IKTokenizerFactory(){
24     }
25
26      /*
27      * (non-Javadoc)
28      * @see org.apache.solr.analysis.BaseTokenizerFactory#init(java.util.Map)
29       */
30      public void init(Map<String,String> args){
31         String _arg = args.get("isMaxWordLength");
32         isMaxWordLength = Boolean.parseBoolean(_arg);
33     }
34
35      /*
36      * (non-Javadoc)
37      * @see org.apache.solr.analysis.TokenizerFactory#create(java.io.Reader)
38       */
39      public Tokenizer create(Reader reader) {
40          return new IKTokenizer(reader , isMaxWordLength());
41     }
42
43      public void setMaxWordLength( boolean isMaxWordLength) {
44          this.isMaxWordLength = isMaxWordLength;
45     }
46
47      public boolean isMaxWordLength() {
48          return isMaxWordLength;
49     }
50 }

转自：http://www.cnblogs.com/TerryLiang/archive/2012/10/08/2714918

知识点

相关文章

最近更新

基于Lucene 4.x的ik-analyzer

相关问答

lucene怎么用[2022-07-03]

Lucene和特殊字符(Lucene and Special Characters)[2022-04-08]

Lucene和cypher(Lucene and cypher)[2021-01-03]

Port Lucene 3.6.2分析仪到Lucene 5.5.0(Port Lucene 3.6.2 Analyzer to Lucene 5.5.0)[2023-08-31]

如何使用lucene进行搜索(How to use lucene to search)[2022-12-21]

如何使用Lucene中的MultiFieldQueryParser？(How to use MultiFieldQueryParser from Lucene?)[2024-03-16]

Lucene 3：StandardAnalyzer在哪里？(Lucene 3: Where is StandardAnalyzer?)[2022-08-17]

Lucene得分问题(Problem with Lucene scoring)[2022-08-18]

Apache Lucene 6.2 StandardAnalyzer版本(Apache Lucene 6.2 StandardAnalyzer version)[2022-02-23]

WikipediaTokenizer Lucene(WikipediaTokenizer Lucene)[2022-11-24]