Solr插件开发

场景介绍:

在处理输入的文本时,需要将 http://bit.ly/3ynriE 等短链接转换为真实链接 lucene.apache.org/solr


1,实现TokenFilter

package com.url.plugin;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.util.regex.Pattern;


public class ResolveUrlTokenFilter extends TokenFilter {

    private final CharTermAttribute charTermAttribute=addAttribute(CharTermAttribute.class);
    private final Pattern patternToMatchShortenedUrls;

    public ResolveUrlTokenFilter(TokenStream input, Pattern patternToMatchShortenedUrls) {
        super(input);
        this.patternToMatchShortenedUrls = patternToMatchShortenedUrls;
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken())
            return false;

        //charTermAttribute会保存读取char
        char[] term=charTermAttribute.buffer();
        int len=term.length;
        //构造字符串
        String token=new String(term,0,len);
        //匹配token中是否出现咱们须要重构的场景
        if(patternToMatchShortenedUrls.matcher(token).matches()){
            charTermAttribute.setEmpty().append(resolveUrlToken(token));
        }

        return true;
    }

    private String resolveUrlToken(String token) {
        //TODO 根据实际需求处理token
        try {
            if ("http://bit.ly/3ynriE".equals(token)) {
                return "lucene.apache.org/solr";
            } else if ("http://bit.ly/15tzw".equals(token)) {
                return "manning.com";
            }
        } catch (Exception exc) {
            // rather than failing analysis if you can't resolve the URL,
            // you should log the error and return the un-resolved value
            exc.printStackTrace();
        }
        return token;
    }
}

2,实现TokenFilterFactory

package com.url.plugin;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

import java.util.Map;
import java.util.regex.Pattern;


public class ResolveUrlTokenFilterFactory extends TokenFilterFactory {

    private Pattern patternToMatchShortenedUrls;

    public ResolveUrlTokenFilterFactory(Map<String, String> args) {
        super(args);
        assureMatchVersion();
        //从solr读取的配置文件信息中获取正则表达式信息
        String shortenedUrls=require(args,"shortenedUrlPattern");
        patternToMatchShortenedUrls=Pattern.compile(shortenedUrls);
    }

    @Override
    public TokenFilter create(TokenStream tokenStream) {
        //建立ResolveUrlTokenFilter实例对象
        return new ResolveUrlTokenFilter(tokenStream,patternToMatchShortenedUrls);
    }
}

3,将其打成jar包

4,在solr的schema文件中添加以下内容

<!-- Text field whose index-time analysis rewrites shortened URLs to their targets. -->
<fieldType name="text_plugin" class="solr.TextField" positionIncrementGap="100">
	<analyzer type="index">
	<!-- StandardTokenizer would split http://bit.ly/3ynriE into several tokens, so the
	     URL filter below would never see a whole URL. UAX29URLEmailTokenizer keeps URLs
	     as single tokens and otherwise follows the same word-break rules. -->
	<tokenizer class="solr.UAX29URLEmailTokenizerFactory"/>
	<filter class="com.url.plugin.ResolveUrlTokenFilterFactory" shortenedUrlPattern="http:\/\/bit.ly\/[\w\-]+" />
	<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
	<filter class="solr.LowerCaseFilterFactory"/>
	</analyzer>
	<!-- NOTE(review): the query analyzer still uses StandardTokenizer and does not resolve
	     URLs; confirm that URL-valued queries are expected to match the indexed form. -->
	<analyzer type="query">
	<tokenizer class="solr.StandardTokenizerFactory"/>
	<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
	<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
	<filter class="solr.LowerCaseFilterFactory"/>
	</analyzer>
</fieldType>

5,在solr的根目录下建立plugins文件夹(位置同dist、contrib文件夹),并将第3步生成的jar放入其中

6,在solrconfig.xml中添加

<lib dir="../../../plugins/" regex=".*\.jar" />

7,java -jar start.jar

相关文章
相关标签/搜索