educoder Lucene - 全文检索入门
作者:互联网
第1关:使用lucene创建索引
package step1;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Builds a Lucene index from the files found directly under
 * {@code source/searchsource}.
 */
public class WriterIndex {

    /**
     * Indexes every regular file in {@code source/searchsource} into the index
     * directory {@code /temp/doc/1101/index}.
     *
     * <p>Each file becomes one document with four fields: {@code filename},
     * {@code content}, {@code path} and {@code size}.
     *
     * @throws IOException if the source directory cannot be listed, a file
     *                     cannot be read, or the index cannot be written
     */
    public static void createIndex() throws IOException {
        /********** Begin **********/
        // Directory holding the original documents.
        File resource = new File("source/searchsource");
        // listFiles() returns null (not an empty array) when the path is missing,
        // is not a directory, or cannot be read; fail with a clear message
        // instead of a NullPointerException in the loop below.
        File[] sourceFiles = resource.listFiles();
        if (sourceFiles == null) {
            throw new IOException("Cannot list source directory: " + resource.getPath());
        }

        // try-with-resources closes the writer first (releasing the index write
        // lock), then the analyzer, then the directory -- even when reading a
        // file or adding a document throws.
        try (Directory dir = FSDirectory.open(new File("/temp/doc/1101/index"));
             Analyzer analyzer = new StandardAnalyzer();
             // IndexWriterConfig: first argument is the Lucene version to be
             // compatible with, second argument is the analyzer.
             IndexWriter index = new IndexWriter(dir, new IndexWriterConfig(Version.LATEST, analyzer))) {

            for (File f : sourceFiles) {
                // Sub-directories cannot be read as text; skip them rather than
                // aborting the whole indexing run.
                if (!f.isFile()) {
                    continue;
                }

                String fileName = f.getName();
                // NOTE(review): this overload decodes with the platform default
                // charset -- confirm the source files match it.
                String fileContent = FileUtils.readFileToString(f);
                String filePath = f.getPath();
                long fileSize = FileUtils.sizeOf(f);

                Document document = new Document();
                // Field arguments: field name, field value, whether to store the value.
                document.add(new TextField("filename", fileName, Store.YES));
                document.add(new TextField("content", fileContent, Store.YES));
                // Path is only stored so it can be displayed with search results.
                document.add(new StoredField("path", filePath));
                document.add(new LongField("size", fileSize, Store.YES));

                index.addDocument(document);
            }
        }
        /********** End **********/
    }
}
第2关:查询索引
package step2;
import java.io.File;
import java.io.IOException;
import javax.sql.rowset.serial.SerialArray;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Runs a fixed term query against the index stored at
 * {@code /temp/doc/1101/index} and prints the matches.
 */
public class SearchIndex {

    /**
     * Searches the {@code content} field for the term {@code mybatis}, prints the
     * total hit count, and then prints the stored {@code path} and
     * {@code content} of up to 10 matching documents.
     *
     * @throws IOException if the index cannot be opened or read
     */
    public static void searchIndex() throws IOException {
        /********** Begin **********/
        // try-with-resources closes the reader and then the directory even when
        // the search or a document fetch throws.
        try (Directory directory = FSDirectory.open(new File("/temp/doc/1101/index"));
             IndexReader reader = DirectoryReader.open(directory)) {

            IndexSearcher searcher = new IndexSearcher(reader);

            Query query = new TermQuery(new Term("content", "mybatis"));

            // Second argument is the maximum number of hits to return.
            TopDocs topDocs = searcher.search(query, 10);

            System.out.println("查询结果的总条数:" + topDocs.totalHits);

            // Each ScoreDoc carries the internal id (doc) of a matching document;
            // that id is used to load the stored fields of the document.
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document document = searcher.doc(scoreDoc.doc);
                System.out.println(document.get("path"));
                System.out.println(document.get("content"));
            }
        }
        /********** End **********/
    }
}
第3关:分词器的使用
package step3;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
 * Demonstrates how an analyzer splits a piece of text into tokens.
 */
public class AnalyzerTest {

    /**
     * Tokenizes a fixed sample sentence with {@link CJKAnalyzer} and prints, for
     * every token, its start offset, its text and its end offset.
     *
     * @param args unused
     * @throws IOException if the token stream fails while being consumed
     */
    public static void main(String[] args) throws IOException {
        /********** Begin **********/
        // A CJK analyzer (not the standard analyzer) is used for the Chinese sample text.
        // try-with-resources closes the token stream and then the analyzer even
        // if consuming the stream throws.
        try (Analyzer analyzer = new CJKAnalyzer();
             // First argument: field name (arbitrary here); second: text to analyze.
             TokenStream tokenStream = analyzer.tokenStream("test", "我喜欢在Educoder上学习")) {

            // Attribute giving access to the text of the current token.
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            // Attribute giving access to the start/end offsets of the current token.
            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);

            // reset() must be called before the first incrementToken().
            tokenStream.reset();
            // incrementToken() returns false once the stream is exhausted.
            while (tokenStream.incrementToken()) {
                System.out.println("start->" + offsetAttribute.startOffset());
                System.out.println(charTermAttribute);
                System.out.println("end->" + offsetAttribute.endOffset());
            }
            // end() completes the consumer workflow (reset, incrementToken, end, close).
            tokenStream.end();
        }
        /********** End **********/
    }
}
标签:lucene,educoder,new,Lucene,全文检索,org,apache,import,document 来源: https://blog.csdn.net/weixin_43833868/article/details/122740544