Lucene是Apache软件基金会Jakarta项目组的一个子项目,是一个开放源代码的全文检索引擎工具包,即它不是一个完整的全文检索引擎,而是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,部分文本分析引擎(英文与德文两种西方语言)。Lucene的目的是为软件开发人员提供一个简单易用的工具包,以方便地在目标系统中实现全文检索的功能,或者是以此为基础创建起完整的全文检索引擎。(摘自百度百科)
操作系统:CentOS 5.8
开发环境:Eclipse 4.3
构建工具:Maven 4.0
为了可以按照书中的例子进行学习,这里依赖的Lucene版本是3.0.1:
<!-- Lucene core library, version 3.0.1 -->
<dependencies>
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>3.0.1</version>
    </dependency>
</dependencies>
完整配置:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates -->
    <groupId>com.linjl.study.book</groupId>
    <artifactId>book_luceneInAction</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <build>
        <!-- Sources live directly under src/ rather than src/main/java -->
        <sourceDirectory>src</sourceDirectory>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <!-- NOTE(review): source/target are left empty; presumably the plugin
                         falls back to its defaults. Confirm, or set an explicit Java level. -->
                    <source />
                    <target />
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <!-- Lucene core library, version 3.0.1 -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>3.0.1</version>
        </dependency>
    </dependencies>
</project>
下面将用两个例子进行Lucene入门讲解。
案例一主要展示通过对指定路径下.txt文件创建索引的过程。
完整源码:
package com.linjl.study.book.luceneInAction.chapter1;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index from the plain files found directly inside a data
 * directory (sub-directories are not visited).
 *
 * <p>Typical use: construct, call {@link #index(String, FileFilter)}, then
 * always call {@link #close()} (ideally from a {@code finally} block).
 */
public class Indexer {

    /** Index storage; kept so that it can be released in {@link #close()}. */
    private final Directory directory;

    private IndexWriter indexWriter;

    /**
     * Opens an index writer on {@code indexDir}. The writer is opened with
     * the "create" flag set to {@code true} and with no limit on field length.
     *
     * @param indexDir path of the directory that holds the index
     * @throws IOException if the directory or the writer cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        // Step 1: create the Directory
        directory = FSDirectory.open(new File(indexDir));

        // Step 2: create the IndexWriter
        boolean writerOpened = false;
        try {
            indexWriter = new IndexWriter(directory,
                    new StandardAnalyzer(Version.LUCENE_30),
                    true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
            writerOpened = true;
        } finally {
            // Do not leak the Directory when the writer could not be created.
            if (!writerOpened) {
                directory.close();
            }
        }
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws CorruptIndexException propagated from the writer
     * @throws IOException           propagated from the writer or the directory
     */
    public void close() throws CorruptIndexException, IOException {
        try {
            // Step 5: close the IndexWriter
            indexWriter.close();
        } finally {
            directory.close();
        }
    }

    /**
     * Indexes every readable, non-hidden regular file directly inside
     * {@code dataDir} that is accepted by {@code fileFilter}.
     *
     * @param dataDir    directory containing the files to index
     * @param fileFilter filter applied to each candidate file; {@code null}
     *                   accepts every file
     * @return the value of {@code IndexWriter.numDocs()} after indexing
     * @throws IOException if {@code dataDir} cannot be listed or a file
     *                     cannot be indexed
     */
    public int index(String dataDir, FileFilter fileFilter) throws IOException {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or an
            // I/O error occurs; fail with a clear message instead of an NPE.
            throw new IOException("Cannot list files in data directory: " + dataDir);
        }

        for (File file : files) {
            if (!file.isDirectory()
                    && !file.isHidden()
                    && file.exists()
                    && file.canRead()
                    && (fileFilter == null || fileFilter.accept(file))) {
                indexFile(file);
            }
        }
        return indexWriter.numDocs();
    }

    /** Logs the file being processed and adds its document to the index. */
    private void indexFile(File file) throws IOException {
        System.out.println("Indexing " + file.getCanonicalPath());
        // Step 3: create the Document
        Document doc = getDocument(file);
        // Step 4: add the Document to the index
        indexWriter.addDocument(doc);
    }

    /**
     * Builds the document for one file: a reader-backed {@code contents}
     * field plus stored, un-analyzed {@code filename} and {@code fullpath}
     * fields.
     *
     * @param file the file to describe
     * @return the document to be added to the index
     * @throws IOException if the file cannot be opened or its canonical path
     *                     cannot be resolved
     */
    protected Document getDocument(File file) throws IOException {
        Document doc = new Document();
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm this matches the encoding of the indexed files.
        doc.add(new Field("contents", new FileReader(file)));
        doc.add(new Field("filename", file.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("fullpath", file.getCanonicalPath(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /** Accepts files whose name ends with ".txt", ignoring case. */
    private static class TextFilesFilter implements FileFilter {
        public boolean accept(File pathname) {
            return pathname.getName().toLowerCase().endsWith(".txt");
        }
    }

    public static void main(String[] strs) throws IOException {
        // Where the index is stored (Linux path)
        String indexDir = "/opt/test/lucene/index";
        // Where the files to be indexed live (Linux path)
        String dataDir = "/opt/test/lucene/files";

        long startTime = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } finally {
            indexer.close();
        }
        long endTime = System.currentTimeMillis();

        System.out.println("Indexing " + numIndexed + " files took "
                + (endTime - startTime) + "ms");
    }
}
案例二展示如何通过对指定的索引文件夹进行关键词搜索。
完整源码:
package com.linjl.study.book.luceneInAction.chapter1;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Runs a keyword query against the {@code contents} field of an existing
 * index and prints the {@code fullpath} field of each hit.
 */
public class Searcher {

    /**
     * Parses {@code searchWord} as a query on the {@code contents} field,
     * fetches up to 30 hits and prints a summary line followed by the
     * {@code fullpath} of every hit.
     *
     * @param indexDir   path of the directory that holds the index
     * @param searchWord query text handed to the query parser
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if {@code searchWord} is not a valid query
     */
    public static void search(String indexDir, String searchWord)
            throws IOException, ParseException {
        // Step 1: create the Directory
        Directory dir = FSDirectory.open(new File(indexDir));
        try {
            // Step 2: create the IndexSearcher
            IndexSearcher indexSearcher = new IndexSearcher(dir);
            try {
                // Step 3: create the QueryParser
                QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
                        new StandardAnalyzer(Version.LUCENE_30));

                long startTime = System.currentTimeMillis();
                // Step 4: parse the text into a Query object
                Query query = parser.parse(searchWord);
                // Step 5: run the query; the result only holds references to the hits
                TopDocs hits = indexSearcher.search(query, 30);
                long endTime = System.currentTimeMillis();

                System.out.println("Found " + hits.totalHits + " document(s) (in "
                        + (endTime - startTime) + "ms) that matched query '"
                        + searchWord + "':");

                for (ScoreDoc scoreDoc : hits.scoreDocs) {
                    // Step 6: load the actual document behind each reference
                    Document doc = indexSearcher.doc(scoreDoc.doc);
                    System.out.println(doc.get("fullpath"));
                }
            } finally {
                // Step 7: close the IndexSearcher, even when parsing or searching failed
                indexSearcher.close();
            }
        } finally {
            dir.close();
        }
    }

    public static void main(String[] args) throws IOException, ParseException {
        String indexDir = "/opt/test/lucene/index";
        String searchWord = "床";
        Searcher.search(indexDir, searchWord);
    }
}
// Minimal search: open the index under /tmp/index, look up the term
// "lucene" in the "contents" field and fetch at most 10 hits.
Directory dir = FSDirectory.open(new File("/tmp/index"));
try {
    IndexSearcher searcher = new IndexSearcher(dir);
    try {
        Query q = new TermQuery(new Term("contents", "lucene"));
        TopDocs hits = searcher.search(q, 10);
    } finally {
        // Release the searcher even when the search throws.
        searcher.close();
    }
} finally {
    dir.close();
}
本文主要是《Lucene in Action》第一章的内容,通过2个例子,对Lucene有了初步的认识,并了解了它的基本使用方法。
(全文完 linjl 20130904 深圳)