Lucene5学习之自定义同义词分词器简单示例

同义词功能在全文搜索时的意义,大家应该都懂的。今天我就试着写了一个同义词分词的示例demo,其实主要代码还是参考Lucene in Action 这本英文版书籍的随书代码,只不过Lucene in Action书里的示例代码目前最新版只支持到Lucene4.x,对于Lucene5.x,代码需要稍作修改,下面是基于Lucene5.x的自定义同义词分词器demo:

12717355_10156599378770341_5246585988498113837_n

package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;
/**
 * Synonym lookup engine: maps a single term to its synonyms.
 *
 * @author Lanxiaowei
 *
 */
public interface SynonymEngine {
    /**
     * Returns the synonyms for the given term.
     *
     * @param s the term to look up
     * @return an array of synonyms, or {@code null} when the term has none
     *         (callers such as SynonymFilter check for {@code null})
     * @throws IOException if the underlying synonym source fails
     */
    String[] getSynonyms(String s) throws IOException;
}
package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;
import java.util.HashMap;

/**
 * Simple in-memory {@link SynonymEngine} backed by a hard-coded synonym table.
 *
 * @author Lanxiaowei
 */
public class BaseSynonymEngine implements SynonymEngine {
    // Shared, populated exactly once. The original code filled this static map
    // from an *instance* initializer, so every new BaseSynonymEngine() re-inserted
    // the same entries into the shared map; a static initializer fixes that.
    private static final HashMap<String, String[]> map = new HashMap<String, String[]>();

    static {
        map.put("quick", new String[] {"fast","speedy"});
        map.put("jumps", new String[] {"leaps","hops"});
        map.put("over", new String[] {"above"});
        map.put("lazy", new String[] {"apathetic","slugish"});
        map.put("dog", new String[] {"canine","pooch"});
    }

    /**
     * Looks up the synonyms of {@code s} in the static table.
     *
     * @param s term to look up
     * @return synonyms, or {@code null} when the term is unknown
     */
    public String[] getSynonyms(String s) throws IOException {
        return map.get(s);
    }
}
package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * 自定义同义词过滤器
 * 
 * @author Lanxiaowei
 * 
 */
public class SynonymFilter extends TokenFilter {
    public static final String TOKEN_TYPE_SYNONYM = "SYNONYM";

    private Stack synonymStack;
    private SynonymEngine engine;
    private AttributeSource.State current;

    private final CharTermAttribute termAtt;
    private final PositionIncrementAttribute posIncrAtt;

    public SynonymFilter(TokenStream in, SynonymEngine engine) {
        super(in);
        synonymStack = new Stack(); // #1
        this.engine = engine;

        this.termAtt = addAttribute(CharTermAttribute.class);
        this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    }

    public boolean incrementToken() throws IOException {
        if (synonymStack.size() > 0) { // #2
            String syn = synonymStack.pop(); // #2
            restoreState(current); // #2
            // 这里Lucene4.x的写法
            // termAtt.setTermBuffer(syn);

            // 这是Lucene5.x的写法
            termAtt.copyBuffer(syn.toCharArray(), 0, syn.length());
            posIncrAtt.setPositionIncrement(0); // #3
            return true;
        }

        if (!input.incrementToken()) // #4
            return false;

        if (addAliasesToStack()) { // #5
            current = captureState(); // #6
        }

        return true; // #7
    }

    private boolean addAliasesToStack() throws IOException {
        // 这里Lucene4.x的写法
        // String[] synonyms = engine.getSynonyms(termAtt.term()); //#8

        // 这里Lucene5.x的写法
        String[] synonyms = engine.getSynonyms(termAtt.toString()); // #8

        if (synonyms == null) {
            return false;
        }
        for (String synonym : synonyms) { // #9
            synonymStack.push(synonym);
        }
        return true;
    }
}

/*
#1 Define synonym buffer
#2 Pop buffered synonyms
#3 Set position increment to 0
#4 Read next token
#5 Push synonyms onto stack
#6 Save current token
#7 Return current token
#8 Retrieve synonyms
#9 Push synonyms onto stack
*/

package com.yida.framework.lucene5.analyzer.synonym;

import java.io.BufferedReader;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import com.yida.framework.lucene5.util.analyzer.codec.MetaphoneReplacementFilter;

/**
 * 自定义同义词分词器
 * 
 * @author Lanxiaowei
 * @createTime 2015-03-31 10:15:23
 */
/**
 * Custom synonym analyzer: tokenizes with {@link StandardTokenizer}, injects
 * synonyms via {@link SynonymFilter}, lower-cases the tokens, and finally
 * removes English stop words.
 *
 * @author Lanxiaowei
 * @createTime 2015-03-31 10:15:23
 */
public class SynonymAnalyzer extends Analyzer {

    private SynonymEngine engine;

    public SynonymAnalyzer(SynonymEngine engine) {
        this.engine = engine;
    }

    @Override
    protected TokenStreamComponents createComponents(String text) {
        // Build the chain: StandardTokenizer -> SynonymFilter -> LowerCaseFilter -> StopFilter
        Tokenizer source = new StandardTokenizer();
        TokenStream chain = new SynonymFilter(source, engine);
        chain = new LowerCaseFilter(chain);
        chain = new StopFilter(chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        return new TokenStreamComponents(source, chain);
    }
}

package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import com.yida.framework.lucene5.util.AnalyzerUtils;

/**
 * Demo driver: prints the tokens (with synonyms) that SynonymAnalyzer
 * produces for a sample sentence.
 */
public class SynonymAnalyzerTest {
    public static void main(String[] args) throws IOException {
        final String sentence = "The quick brown fox jumps over the lazy dog";
        Analyzer synonymAnalyzer = new SynonymAnalyzer(new BaseSynonymEngine());
        AnalyzerUtils.displayTokens(synonymAnalyzer, sentence);
    }
}
package com.yida.framework.lucene5.util;

import java.io.IOException;

import junit.framework.Assert;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * 用于分词器测试的一个简单工具类(用于打印分词情况,包括Term的起始位置和结束位置(即所谓的偏 * 移量),位置增量,Term字符串,Term字符串类型(字符串/阿拉伯数字之类的))
 * @author Lanxiaowei
 *
 */
/**
 * Small helper for testing analyzers: prints each token's position, term text,
 * start/end offsets and type, and asserts an expected token sequence.
 *
 * @author Lanxiaowei
 */
public class AnalyzerUtils {
    /**
     * Analyzes {@code text} with the given analyzer and prints every token.
     */
    public static void displayTokens(Analyzer analyzer, String text) throws IOException {
        TokenStream tokenStream = analyzer.tokenStream("text", text);
        displayTokens(tokenStream);
    }

    /**
     * Consumes the stream and prints position, term, offsets and type of each
     * token. Closes the stream when done (the original leaked it: neither
     * end() nor close() was called).
     */
    public static void displayTokens(TokenStream tokenStream) throws IOException {
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);

        tokenStream.reset(); // mandatory before the first incrementToken() since Lucene 4
        try {
            int position = 0;
            while (tokenStream.incrementToken()) {
                int increment = positionIncrementAttribute.getPositionIncrement();
                // Only advance/print the position when the increment is > 0;
                // synonyms (increment 0) share their original token's position.
                if (increment > 0) {
                    position = position + increment;
                    System.out.print(position + ":");
                }
                int startOffset = offsetAttribute.startOffset();
                int endOffset = offsetAttribute.endOffset();
                String term = charTermAttribute.toString();
                System.out.println("[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());
            }
            tokenStream.end();
        } finally {
            tokenStream.close();
        }
    }

    /**
     * Asserts that analyzing {@code text} yields exactly the expected terms.
     *
     * @param analyzer  analyzer under test
     * @param text      source string
     * @param expecteds expected term sequence after analysis
     * @throws IOException if the token stream fails
     */
    public static void assertAnalyzerTo(Analyzer analyzer, String text, String[] expecteds) throws IOException {
        TokenStream tokenStream = analyzer.tokenStream("text", text);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        // BUG FIX: the original never called reset(); since Lucene 4 calling
        // incrementToken() on an un-reset stream throws IllegalStateException.
        tokenStream.reset();
        try {
            for (String expected : expecteds) {
                Assert.assertTrue(tokenStream.incrementToken());
                Assert.assertEquals(expected, charTermAttribute.toString());
            }
            Assert.assertFalse(tokenStream.incrementToken());
            tokenStream.end();
        } finally {
            tokenStream.close();
        }
    }
}

以上代码都是Lucene in Action这本书里面的示例代码,我只不过是基于Lucene5.x把它重写并调试成功了,特此分享,希望对正在学习Lucene5的童鞋们有所帮助。

转载请注明出处:代码说 » Lucene5学习之自定义同义词分词器简单示例


关注微信公众号

码中人 微信公众号