public
void
crateIndex()
throws
Exception {
File indexDir =
new
File(
"D://luceneIndex"
);
//
存储索引文件夹
File dataDir =
new
File(
"D://test"
);
//
须要检索文件夹
Analyzer luceneAnalyzer = new PaodingAnalyzer();
//
PaodingAnalyzer这个类是庖丁解牛中文分词分析器类继承了Lucene的
Analyzer接口,对于检索中文分词有很大帮助
File[] dataFiles = dataDir.listFiles();
boolean
fileIsExist =
false
;
if
(indexDir.listFiles().
length
== 0)
fileIsExist =
true
;
IndexWriter indexWriter =
new
IndexWriter(indexDir,
luceneAnalyzer
, fileIsExist);
//
第三个参数是一个布尔型的变量,若是为
true
的话就表明建立一个新的索引,为
false
的话就表明在原来索引的基础上进行操做。
long
startTime =
new
Date().getTime();
this
.doIndex(dataFiles, indexWriter);
indexWriter.optimize();//优化索引
indexWriter.close();//关闭索引
long
endTime =
new
Date().getTime();
System.
out
.println(
"It takes "
+ (endTime - startTime)
+
" milliseconds to create index for the files in directory "
+ dataDir.getPath());
{color:black}}
* private{*}
void
doIndex(File[] dataFiles, IndexWriter indexWriter)
throws
Exception {
for
(
int
i = 0; i < dataFiles.
length
; i++) {
if
(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(
".html"
)) {//索引全部html格式文件
System.
out
.println(
"Indexing file "
+ dataFiles[i].getCanonicalPath());
Reader txtReader =
new
FileReader(dataFiles[i]);
Document document =
new
Document();
// Field.Store.YES
存储
Field.Store.NO
不存储
// Field.Index.TOKENIZED
分词
Field.Index.UN_TOKENIZED
不分词
document.add(
new
Field(
"path"
, dataFiles[i].getCanonicalPath(), Field.Store.
YES
,
Field.Index.
UN_TOKENIZED
));
document.add(
new
Field(
"filename"
, dataFiles[i].getName(), Field.Store.
YES
, Field.Index.
TOKENIZED
));
//
另一个构造函数
,
接受一个
Reader
对象
document.add(
new
Field(
"contents"
, txtReader));
indexWriter.addDocument(document);
{color:black}}
else
if
(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(
".doc"
)) {//索引全部word文件
FileInputStream in =
new
FileInputStream(dataFiles[i]);//
得到文件流
WordExtractor extractor =
new
WordExtractor(in);//
使用POI对word文件进行解析
String str = extractor.getText();//
返回String
Document document =
new
Document();//生成
Document对象,其中有3个
Field,分别是
path
,
filename,
contents
document.add(
new
Field(
"path"
, dataFiles[i].getCanonicalPath(), Field.Store.
YES
,
Field.Index.
UN_TOKENIZED
));
document.add(
new
Field(
"filename"
, dataFiles[i].getName(), Field.Store.
YES
, Field.Index.
TOKENIZED
));
//
另一个构造函数
,
接受一个
Reader
对象
document.add(
new
Field(
"contents"
, str, Field.Store.
YES
,Field.Index.
TOKENIZED
,
Field.TermVector.
WITH_POSITIONS_OFFSETS
));
indexWriter.addDocument(document);
{color:black}}
else
{
if
(dataFiles[i].isDirectory()) {
doIndex(dataFiles[i].listFiles(), indexWriter);//使用递归,继续索引文件夹
{color:black}}
{color:black}}
{color:black}}
{color:black}}