最近项目要用文本检索工具,于是了解到了 Lucene。我查了很多关于 Lucene 的资料,发现 Lucene 的版本比较多,且每个版本之间的方法变更比较大。最后确定使用 Lucene 4.7.2 版本。理论就不多说了,网上有很多介绍理论的文章,我就直接把能用的代码介绍一下。
1.首先创建索引工具
public static void indexBuilding(String indexPath, List<Book> datas){// indexPath表示索引存放的目录 datas表示数据(能够从文本里、表里等取到数据)
try {
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
indexWriterConfig.setOpenMode(OpenMode.CREATE);//已建立模式创建索引
IndexWriter indexWriter = new IndexWriter(FSDirectory.open(new File(indexPath)), indexWriterConfig);
for (Book bood: datas) {
indexWriter.addDocument(Document(wotvMediaInfoModel));//写入索引
}
indexWriter.close();
} catch (Exception e) {
e.printStackTrace();
}测试
}ui
public static Document Document(Book book) {
Document doc = new Document();
doc.add(new StringField("name", book.getName(), Field.Store.YES));//name建立索引,并保存
doc.add(new StringField("title", book.getTitle(), Field.Store.No));//title建立索引,不保存
doc.add(new TextField("name", book.getAuthor(), Field.Store.YES));//author建立索引,保存
return doc;
}this
/**
 * Plain data holder for one book record that is fed into the index.
 */
public class Book implements Serializable {
    private static final long serialVersionUID = 1L;

    private int id;
    private String name;
    private String title;
    private String author;

    /** Creates an empty book; populate it through the setters. */
    public Book() {
    }

    public void setId(int id) {
        this.id = id;
    }

    public int getId() {
        return id;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getName() {
        return name;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle() {
        return title;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getAuthor() {
        return author;
    }
}
2.创建完索引,就可以进行查询了
/**
 * Searches the {@code name} field for the given keyword.
 *
 * <p>The keyword is first segmented with {@code IK_Analyzer}; the resulting terms are then
 * handed to {@code queryByOneKey}. Elapsed time and every hit are written to the log.
 *
 * @param keyword raw user input
 * @return the matching results, or {@code null} when segmentation yields nothing, when
 *         there are no hits, or when any exception occurs (the exception is logged)
 */
public List<QueryResult> query(String keyword) {
    try {
        final long startMillis = System.currentTimeMillis();
        final IndexSearcher searcher = initIndexSearch();

        // Segment the keyword; nothing usable means nothing to search for.
        final String terms = IK_Analyzer(keyword);
        if (StringUtil.isEmpty(terms)) {
            return null;
        }

        final List<QueryResult> hits = queryByOneKey(searcher, "name", terms);
        if (CollectionUtil.isEmpty(hits)) {
            return null;
        }

        final long elapsedMillis = System.currentTimeMillis() - startMillis;
        logger.info("queryByOneField 耗时:" + elapsedMillis + "ms");
        for (QueryResult hit : hits) {
            logger.info("queryByOneField查询结果:" + hit.toString());
        }
        return hits;
    } catch (Exception e) {
        logger.error("queryByOneField error", e);
        return null;
    }
}
/**
 * Returns the shared {@code IndexSearcher}, opening the index at
 * {@code Lucene.Path.indexFilePath} on first use.
 *
 * <p>The searcher is cached in the {@code indexSearcher} field and reused on later calls;
 * this method never reopens the reader. No synchronization is performed here.
 *
 * @return the cached searcher, or {@code null} if the index could not be opened
 */
protected static IndexSearcher initIndexSearch() {
    if (indexSearcher != null) {
        return indexSearcher;
    }
    try {
        DirectoryReader directoryReader =
                DirectoryReader.open(FSDirectory.open(new File(Lucene.Path.indexFilePath)));
        indexSearcher = new IndexSearcher(directoryReader);
    } catch (IOException e) {
        // Log once, under this method's own name, with the exception as the cause.
        logger.error("initIndexSearch error", e);
        return null;
    }
    return indexSearcher;
}
/**
 * Segments the input with the IK analyzer and joins the resulting terms with spaces.
 *
 * <p>Only tokens whose offset span is longer than one character are kept. Each kept term
 * is followed by a single space, so a non-empty result ends with a trailing space.
 *
 * @param str text to segment; {@code null} yields an empty string
 * @return the space-separated terms, or an empty string if nothing was kept
 */
protected static String IK_Analyzer(String str) {
    if (str == null) {
        return "";
    }
    StringBuilder results = new StringBuilder();
    Analyzer ikAnalyzer = new IKAnalyzer();
    try {
        TokenStream tokenStream = ikAnalyzer.tokenStream("", new StringReader(str));
        try {
            // addAttribute returns the existing instance when the stream already has one,
            // so it is safe regardless of which attributes the tokenizer registered.
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                int tokenLength = offsetAttribute.endOffset() - offsetAttribute.startOffset();
                if (tokenLength > 1) {
                    results.append(charTermAttribute.toString()).append(' ');
                }
            }
            // Complete the consumer workflow: reset -> incrementToken* -> end -> close.
            tokenStream.end();
        } finally {
            tokenStream.close();
        }
    } catch (IOException e) {
        // Keep whatever was collected before the failure, as the original did.
        logger.error("IK_Analyzer error", e);
    } finally {
        ikAnalyzer.close();
    }
    logger.info("IK_Analyzer字典分词结果:[" + results + "]");
    return results.toString();
}
/**
 * Runs a query against a single field and returns the top three hits.
 *
 * <p>The key string is parsed by a {@code QueryParser} backed by a {@code StandardAnalyzer};
 * multiple terms are combined with OR.
 *
 * @param indexSearcher searcher to run the query on
 * @param field         name of the field to search
 * @param keys          query text (for example space-separated terms)
 * @return the hits in the order returned by the searcher, or {@code null} if an
 *         {@code IOException} occurred (the exception is logged)
 * @throws ParseException if {@code keys} cannot be parsed as a query
 */
protected static List<QueryResult> queryByOneKey(IndexSearcher indexSearcher, String field, String keys)
        throws ParseException {
    try {
        long startMillis = System.currentTimeMillis();
        QueryParser queryParser = new QueryParser(Version.LUCENE_47, field,
                new StandardAnalyzer(Version.LUCENE_47));
        // Operator.OR matches documents containing any term;
        // Operator.AND would require every term to be present.
        queryParser.setDefaultOperator(Operator.OR);
        Query query = queryParser.parse(keys);
        TopDocs topDocs = indexSearcher.search(query, 3); // keep only the 3 best matches
        long elapsedMillis = System.currentTimeMillis() - startMillis;
        logger.info("queryByOneKey 总共为您找到 " + topDocs.totalHits + "条记录\t耗时:" + elapsedMillis
                + "ms");
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        List<QueryResult> list = new ArrayList<QueryResult>(scoreDocs.length);
        for (ScoreDoc scoreDoc : scoreDocs) {
            list.add(getIndexResult(indexSearcher.doc(scoreDoc.doc), scoreDoc.score));
        }
        return list;
    } catch (IOException e) {
        // Pass the exception as the cause, matching the logging style used by query().
        logger.error("queryByOneKey error", e);
    }
    return null;
}
/**
 * Assembles a {@code QueryResult} from a matched document and its relevance score.
 *
 * <p>Reads the stored fields {@code cid}, {@code res} and {@code name} from the document.
 * NOTE(review): the indexing code visible alongside this method writes only
 * name/title/author - confirm that "cid" and "res" exist in the index being searched.
 *
 * @param doc   the matched document
 * @param score the relevance score of the match
 * @return the populated result object
 */
private static QueryResult getIndexResult(Document doc, float score) {
    final QueryResult hit = new QueryResult();
    hit.setScore(score);
    hit.setName(doc.get("name"));
    hit.setRes(doc.get("res"));
    hit.setCid(doc.get("cid"));
    return hit;
}
/**
 * Holder for one search hit: the field values read back from the index plus the
 * relevance score.
 */
public class QueryResult {
    private String cid;
    private String res;
    private String name;
    private String title;
    private String author;
    private float score;

    public void setCid(String cid) {
        this.cid = cid;
    }

    public String getCid() {
        return cid;
    }

    public void setRes(String res) {
        this.res = res;
    }

    public String getRes() {
        return res;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getName() {
        return name;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle() {
        return title;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getAuthor() {
        return author;
    }

    public void setScore(float score) {
        this.score = score;
    }

    public float getScore() {
        return score;
    }

    /** Readable form for log output. */
    @Override
    public String toString() {
        return "QueryResult{cid=" + cid
                + ", res=" + res
                + ", name=" + name
                + ", title=" + title
                + ", author=" + author
                + ", score=" + score
                + "}";
    }
}
3.利用IK分词器
IK分词器对中文分词的支持比较好,而且可以自己扩展字典。
自定义字典配置
/**
 * IK analyzer configuration that adds a user-defined extension dictionary to the
 * built-in main and quantifier dictionaries.
 */
public class MyConfig implements Configuration {
    /*
     * Resource paths of the tokenizer's default dictionaries.
     */
    private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic";
    private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";

    /*
     * Whether smart segmentation is used.
     */
    private boolean useSmart;

    /**
     * Returns the useSmart flag: {@code true} selects the smart segmentation strategy,
     * {@code false} selects fine-grained segmentation.
     *
     * @return useSmart
     */
    public boolean useSmart() {
        return useSmart;
    }

    /**
     * Sets the useSmart flag: {@code true} selects the smart segmentation strategy,
     * {@code false} selects fine-grained segmentation.
     *
     * @param useSmart the new flag value
     */
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * Returns the main dictionary path.
     *
     * @return the main dictionary path
     */
    public String getMainDictionary() {
        return PATH_DIC_MAIN;
    }

    /**
     * Returns the quantifier dictionary path.
     *
     * @return the quantifier dictionary path
     */
    public String getQuantifierDicionary() {
        return PATH_DIC_QUANTIFIER;
    }

    /**
     * Returns the extension dictionary paths.
     *
     * @return paths relative to the class loader; empty when no custom dictionary is configured
     */
    public List<String> getExtDictionarys() {
        List<String> extDictFiles = new ArrayList<String>(1);
        // Path of the user-defined dictionary. Per the original author's note, IK resolves
        // this file through the ClassLoader, so it is best placed alongside this class;
        // otherwise the extension dictionary has no effect.
        String extDictCfg = Lucene.IKAnalyzer.myDicPath;
        // Skip an unset path rather than handing a null/blank resource name to the loader.
        if (extDictCfg != null && extDictCfg.trim().length() > 0) {
            extDictFiles.add(extDictCfg);
        }
        return extDictFiles;
    }

    /**
     * Returns the extension stop-word dictionary paths.
     *
     * @return an empty list; no custom stop-word dictionaries are configured
     */
    @Override
    public List<String> getExtStopWordDictionarys() {
        // An empty list instead of null, so callers can iterate without a null check.
        return new ArrayList<String>(0);
    }
}
/**
 * Entry point for loading the custom IK dictionary configuration.
 */
public class InitMyDictionary {

    /**
     * Initializes the IK dictionary with a fresh {@code MyConfig}, which adds the
     * user-defined configuration to the dictionary.
     */
    public static void init() {
        MyConfig customConfig = new MyConfig();
        Dictionary.initial(customConfig);
    }
}
/**
 * Manual smoke test: loads the custom dictionary, then runs one sample query.
 */
public class Test {

    public static void main(String[] args) throws Exception {
        // Sample keyword to search for.
        final String keyword = "水浒传";

        // Step 1 - build the index first if it does not exist yet:
        //   indexBuilding(<index directory>, <data>);

        // Step 2 - register the extension dictionary with IK.
        InitMyDictionary.init();

        // Step 3 - run the search.
        // NOTE(review): query(...) is called unqualified here; confirm how it is made
        // reachable from this class (it is not declared in Test).
        query(keyword);
    }
}
以上就是完整的可运行代码。要想使用,需要先引入 Lucene 和 IK Analyzer 的 jar 包,然后拿到自己需要的数据,再把上面创建索引和查询的方法自己组织一下,就可以测试使用了。
以上就是我目前了解的 Lucene,还有很多功能没有了解到,但是目前已经够项目使用了。