使用 Lucene-Net 實現全文檢索
Lucene.net 是 Lucene 的 .NET 移植版本,是一個開源的全文檢索引擎開發包,即它不是一個完整的全文檢索引擎,而是一個全文檢索引擎的架構,提供了完整的查詢引擎和索引引擎。
開發人員可以基於 Lucene.net 實現全文檢索的功能。
Lucene.net 是 Apache 軟件基金會贊助的開源項目,基於 Apache License 協議。
Lucene.net 並不是一個爬行搜索引擎,也不會自動地索引內容。我們得先將要索引的文檔中的文本抽取出來,然後再將其加到 Lucene.net 索引中。標準的步驟是先初始化一個 Analyzer、打開一個 IndexWriter、然後再將文檔一個接一個地加進去。一旦完成這些步驟,索引就可以在關閉前得到優化,同時所做的改變也會生效。這個過程可能比開發者習慣的方式更加手工化一些,但卻在數據的索引上給予你更多的靈活性,而且其效率也很高。
獲取索引目錄
/// <summary>
/// Resolves the on-disk Lucene index directory for the given index type.
/// </summary>
/// <param name="index">Index type; its enum description is used as the sub-directory name.</param>
/// <returns>An open Lucene <see cref="LcStore.Directory"/> for that index.</returns>
private LcStore.Directory GetLuceneDirectory(IndexType index)
{
    var indexPath = string.Empty;
    try
    {
        // Root folder comes from configuration; each index type gets its own sub-folder.
        var dirPath = ConfigHelper.GetAppSetting("LuceneIndexPath");
        var indexName = Enum.EnumHelper.GetEnumDescription(index);
        indexPath = Path.Combine(dirPath, indexName);
        return LcStore.FSDirectory.Open(indexPath);
    }
    catch (Exception ex)
    {
        NLogger.Write("獲取索引目錄失敗" + Environment.NewLine +
            $"路徑:{indexPath}" + Environment.NewLine +
            $"異常信息:{ex}",
            "Lucene", "x", "x",
            CustomException.UnknownError, CustomLogLevel.Error);
        // Throw a specific exception type rather than the base Exception.
        throw new InvalidOperationException("獲取索引目錄異常,詳情請查看相關日誌");
    }
}
#endregion 獲取目錄
盤古分詞
/// <summary>
/// Tokenizes a sentence with the PanGu analyzer.
/// </summary>
/// <param name="keyword">Sentence to split.</param>
/// <returns>Array of terms, or null when the analyzer produced no tokens.</returns>
public string[] GetSplitKeywords(string keyword)
{
    try
    {
        var terms = new List<string>();
        // Dispose the reader deterministically, even if tokenization throws.
        using (var reader = new StringReader(keyword))
        {
            var ts = PanguAnalyzer.TokenStream(keyword, reader);
            while (ts.IncrementToken())
            {
                var ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                terms.Add(ita.Term);
            }
        }
        // NOTE: do NOT call PanguAnalyzer.Close() here — the analyzer is a cached,
        // shared instance and closing it would break every subsequent operation.
        // Collecting into a list also avoids the old "join on '|' then split"
        // round-trip, which corrupted terms that themselves contained '|'.
        return terms.Count == 0 ? null : terms.ToArray();
    }
    catch (Exception ex)
    {
        NLogger.Write("分詞異常" + Environment.NewLine +
            $"關鍵詞:{keyword}" + Environment.NewLine +
            $"異常信息:{ex}",
            "Lucene", "x", "x",
            CustomException.UnknownError, CustomLogLevel.Error);
        // Throw a specific exception type rather than the base Exception.
        throw new InvalidOperationException("分詞出現異常,詳情請查看相關日誌");
    }
}
#endregion 分詞
創建索引或追加索引
/// <summary>
/// Creates a new index or appends documents to an existing one.
/// </summary>
/// <param name="dataList">Documents to index; no-op when null or empty.</param>
/// <param name="index">Index type selecting the target directory.</param>
public void CreateOrAppendIndexes(List<Document> dataList, IndexType index)
{
    if (dataList == null || dataList.Count == 0)
        return;
    var directory = GetLuceneDirectory(index);
    IndexWriter writer = null;
    try
    {
        try
        {
            // create: false => append to an existing index.
            writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
        }
        catch
        {
            // Opening for append failed (presumably no index exists yet);
            // create: true builds the index from scratch.
            writer = new IndexWriter(directory, PanguAnalyzer, true, IndexWriter.MaxFieldLength.LIMITED);
        }
        writer.MergeFactor = 1000;
        foreach (var doc in dataList)
        {
            writer.AddDocument(doc);
        }
        writer.Optimize();
    }
    finally
    {
        // Previously the writer and directory leaked when AddDocument/Optimize threw.
        writer?.Dispose();
        directory.Dispose();
    }
}
完整代碼
/// <summary>
/// Lucene搜索引擎幫助類
/// </summary>
/// <summary>
/// Lucene full-text search helper (thread-safe singleton).
/// Wraps index creation, deletion, update and paged querying
/// on top of Lucene.Net with the PanGu Chinese analyzer.
/// </summary>
public class LuceneHelper
{
    /// <summary>
    /// Private constructor — use <see cref="Instance"/>.
    /// </summary>
    private LuceneHelper()
    {
    }

    #region 屬性

    /// <summary>
    /// Singleton instance. Initialized eagerly by the CLR, which is
    /// thread-safe — unlike the previous "_instance ?? (_instance = ...)"
    /// pattern, which could race and create two instances.
    /// </summary>
    public static LuceneHelper Instance { get; } = new LuceneHelper();

    // Shared analyzer; must never be closed/disposed, it is reused by
    // every indexing and tokenizing operation on this singleton.
    private readonly Analyzer _analyzer = new PanGuAnalyzer();

    /// <summary>
    /// Shared PanGu analyzer.
    /// </summary>
    private Analyzer PanguAnalyzer => _analyzer;

    #endregion 屬性

    #region 獲取目錄

    /// <summary>
    /// Resolves the on-disk Lucene index directory for the given index type.
    /// </summary>
    /// <param name="index">Index type; its enum description is used as the sub-directory name.</param>
    /// <returns>An open Lucene <see cref="LcStore.Directory"/> for that index.</returns>
    private LcStore.Directory GetLuceneDirectory(IndexType index)
    {
        var indexPath = string.Empty;
        try
        {
            // Root folder comes from configuration; each index type gets its own sub-folder.
            var dirPath = ConfigHelper.GetAppSetting("LuceneIndexPath");
            var indexName = Enum.EnumHelper.GetEnumDescription(index);
            indexPath = Path.Combine(dirPath, indexName);
            return LcStore.FSDirectory.Open(indexPath);
        }
        catch (Exception ex)
        {
            NLogger.Write("獲取索引目錄失敗" + Environment.NewLine +
                $"路徑:{indexPath}" + Environment.NewLine +
                $"異常信息:{ex}",
                "Lucene", "x", "x",
                CustomException.UnknownError, CustomLogLevel.Error);
            // Throw a specific exception type rather than the base Exception.
            throw new InvalidOperationException("獲取索引目錄異常,詳情請查看相關日誌");
        }
    }

    #endregion 獲取目錄

    #region 分詞

    /// <summary>
    /// Tokenizes a sentence with the PanGu analyzer.
    /// </summary>
    /// <param name="keyword">Sentence to split.</param>
    /// <returns>Array of terms, or null when the analyzer produced no tokens.</returns>
    public string[] GetSplitKeywords(string keyword)
    {
        try
        {
            var terms = new List<string>();
            // Dispose the reader deterministically, even if tokenization throws.
            using (var reader = new StringReader(keyword))
            {
                var ts = PanguAnalyzer.TokenStream(keyword, reader);
                while (ts.IncrementToken())
                {
                    var ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                    terms.Add(ita.Term);
                }
            }
            // NOTE: the original called PanguAnalyzer.Close() here, which closed
            // the shared analyzer and broke every subsequent operation. Collecting
            // into a list also avoids the old "join on '|' then split" round-trip,
            // which corrupted terms that themselves contained '|'.
            return terms.Count == 0 ? null : terms.ToArray();
        }
        catch (Exception ex)
        {
            NLogger.Write("分詞異常" + Environment.NewLine +
                $"關鍵詞:{keyword}" + Environment.NewLine +
                $"異常信息:{ex}",
                "Lucene", "x", "x",
                CustomException.UnknownError, CustomLogLevel.Error);
            // Throw a specific exception type rather than the base Exception.
            throw new InvalidOperationException("分詞出現異常,詳情請查看相關日誌");
        }
    }

    #endregion 分詞

    #region 索引增刪改查

    /// <summary>
    /// Creates a new index or appends documents to an existing one.
    /// </summary>
    /// <param name="dataList">Documents to index; no-op when null or empty.</param>
    /// <param name="index">Index type selecting the target directory.</param>
    public void CreateOrAppendIndexes(List<Document> dataList, IndexType index)
    {
        if (dataList == null || dataList.Count == 0)
            return;
        var directory = GetLuceneDirectory(index);
        IndexWriter writer = null;
        try
        {
            try
            {
                // create: false => append to an existing index.
                writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
            }
            catch
            {
                // Opening for append failed (presumably no index exists yet);
                // create: true builds the index from scratch.
                writer = new IndexWriter(directory, PanguAnalyzer, true, IndexWriter.MaxFieldLength.LIMITED);
            }
            writer.MergeFactor = 1000;
            foreach (var doc in dataList)
            {
                writer.AddDocument(doc);
            }
            writer.Optimize();
        }
        finally
        {
            // Previously the writer and directory leaked when AddDocument/Optimize threw.
            writer?.Dispose();
            directory.Dispose();
        }
    }

    /// <summary>
    /// Deletes all documents whose <paramref name="field"/> matches <paramref name="value"/>.
    /// </summary>
    /// <param name="field">Field name to match.</param>
    /// <param name="value">Field value to match.</param>
    /// <param name="index">Index type selecting the target directory.</param>
    public void DeleteIndexes(string field, string value, IndexType index)
    {
        IndexWriter writer = null;
        var directory = GetLuceneDirectory(index);
        try
        {
            writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
            writer.DeleteDocuments(new Term(field, value));
            writer.Optimize();
        }
        catch (Exception ex)
        {
            NLogger.Write("刪除索引異常" + Environment.NewLine +
                $"異常信息:{ex}", "Lucene", "x", "x",
                CustomException.UnknownError, CustomLogLevel.Error);
            throw new InvalidOperationException("刪除索引異常,詳情請查看相關日誌");
        }
        finally
        {
            writer?.Dispose();
            directory?.Dispose();
        }
    }

    /// <summary>
    /// Updates a document. Lucene implements this as delete-then-insert, so the
    /// supplied <paramref name="doc"/> must carry values for ALL fields —
    /// any field left unset will end up empty in the index.
    /// </summary>
    /// <param name="field">Field name identifying the document to replace.</param>
    /// <param name="value">Field value identifying the document to replace.</param>
    /// <param name="doc">Replacement document.</param>
    /// <param name="index">Index type selecting the target directory.</param>
    public void UpdateIndexes(string field, string value, Document doc, IndexType index)
    {
        IndexWriter writer = null;
        var directory = GetLuceneDirectory(index);
        try
        {
            writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
            writer.UpdateDocument(new Term(field, value), doc);
        }
        catch (Exception ex)
        {
            NLogger.Write("更新索引異常" + Environment.NewLine +
                $"異常信息:{ex}", "Lucene", "x", "x",
                CustomException.UnknownError, CustomLogLevel.Error);
            throw new InvalidOperationException("更新索引異常,詳情請查看相關日誌");
        }
        finally
        {
            writer?.Dispose();
            directory?.Dispose();
        }
    }

    #endregion 索引增刪改查

    #region 查詢

    /// <summary>
    /// Searches the index and materializes the top hits into entities.
    /// </summary>
    /// <typeparam name="T">Entity type; properties are filled from same-named document fields.</typeparam>
    /// <param name="fields">Fields to query.</param>
    /// <param name="keywords">Terms to match (any field, any term — OR semantics).</param>
    /// <param name="index">Index type selecting the target directory.</param>
    /// <param name="sort">Sort order; may be null for relevance order.</param>
    /// <param name="count">Maximum number of hits to return.</param>
    /// <returns>Matched entities, or null when nothing matched or the input was empty.</returns>
    public List<T> Search<T>
    (
        string[] fields,
        string[] keywords,
        IndexType index,
        Sort sort,
        int count
    ) where T : new()
    {
        if (fields == null || fields.Length == 0)
            return null;
        if (keywords == null || keywords.Length == 0)
            return null;
        var directory = GetLuceneDirectory(index);
        IndexSearcher searcher = null;
        try
        {
            var boolQuery = GetQuery(fields, keywords);
            searcher = new IndexSearcher(directory, true);
            var docs = sort != null
                ? searcher.Search(boolQuery, null, count, sort)
                : searcher.Search(boolQuery, count);
            if (docs == null || docs.TotalHits == 0)
                return null;
            var docList = docs.ScoreDocs.Select(sd => searcher.Doc(sd.Doc)).ToList();
            return ConvertDocToObj<T>(docList);
        }
        finally
        {
            // Dispose on every path — the original leaked both objects
            // on the empty-result early return.
            searcher?.Dispose();
            directory.Dispose();
        }
    }

    /// <summary>
    /// Searches the index with explicit sorting and returns one page of results.
    /// </summary>
    /// <typeparam name="T">Entity type; properties are filled from same-named document fields.</typeparam>
    /// <param name="fields">Fields to query.</param>
    /// <param name="keywords">Terms to match (OR semantics).</param>
    /// <param name="index">Index type selecting the target directory.</param>
    /// <param name="sort">Sort order; required.</param>
    /// <param name="pageNumber">1-based page number.</param>
    /// <param name="pageSize">Page size.</param>
    /// <returns>Paged result with total hit count, or null when nothing matched or the input was empty.</returns>
    public PagedResult<List<T>> SearchByPaged<T>
    (
        string[] fields,
        string[] keywords,
        IndexType index,
        Sort sort,
        int pageNumber = 1,
        int pageSize = 20
    ) where T : new()
    {
        if (fields == null || fields.Length == 0)
            return null;
        if (keywords == null || keywords.Length == 0)
            return null;
        var directory = GetLuceneDirectory(index);
        IndexSearcher searcher = null;
        try
        {
            var boolQuery = GetQuery(fields, keywords);
            // Collect enough hits to cover every page up to the requested one.
            var collector = TopFieldCollector
                .Create(sort, pageNumber * pageSize, false, false, false, false);
            searcher = new IndexSearcher(directory, true);
            searcher.Search(boolQuery, collector);
            if (collector.TotalHits == 0)
                return null;
            // Slice out the requested page.
            var start = (pageNumber - 1) * pageSize;
            var hits = collector.TopDocs(start, pageSize).ScoreDocs;
            var docList = hits.Select(sd => searcher.Doc(sd.Doc)).ToList();
            return new PagedResult<List<T>>
            {
                Total = collector.TotalHits,
                Result = ConvertDocToObj<T>(docList)
            };
        }
        finally
        {
            // Dispose on every path — the original leaked on the empty-result early return.
            searcher?.Dispose();
            directory.Dispose();
        }
    }

    /// <summary>
    /// Searches the index in default (relevance) order and returns one page of results.
    /// </summary>
    /// <typeparam name="T">Entity type; properties are filled from same-named document fields.</typeparam>
    /// <param name="fields">Fields to query.</param>
    /// <param name="keywords">Terms to match (OR semantics).</param>
    /// <param name="index">Index type selecting the target directory.</param>
    /// <param name="pageNumber">1-based page number.</param>
    /// <param name="pageSize">Page size.</param>
    /// <returns>Paged result with total hit count, or null when nothing matched or the input was empty.</returns>
    public PagedResult<List<T>> SearchByPaged<T>
    (
        string[] fields,
        string[] keywords,
        IndexType index,
        int pageNumber = 1,
        int pageSize = 20
    ) where T : new()
    {
        if (fields == null || fields.Length == 0)
            return null;
        if (keywords == null || keywords.Length == 0)
            return null;
        var directory = GetLuceneDirectory(index);
        IndexSearcher searcher = null;
        try
        {
            var boolQuery = GetQuery(fields, keywords);
            // Collect enough hits to cover every page up to the requested one.
            var collector = TopScoreDocCollector.Create(pageNumber * pageSize, false);
            searcher = new IndexSearcher(directory, true);
            searcher.Search(boolQuery, collector);
            if (collector.TotalHits == 0)
                return null;
            // Slice out the requested page.
            var start = (pageNumber - 1) * pageSize;
            var hits = collector.TopDocs(start, pageSize).ScoreDocs;
            var docList = hits.Select(sd => searcher.Doc(sd.Doc)).ToList();
            return new PagedResult<List<T>>
            {
                Total = collector.TotalHits,
                Result = ConvertDocToObj<T>(docList)
            };
        }
        finally
        {
            // Dispose on every path — the original leaked on the empty-result early return.
            searcher?.Dispose();
            directory.Dispose();
        }
    }

    /// <summary>
    /// Counts the total number of hits for the given query.
    /// (Method name keeps the historical "GetTotla" spelling for caller compatibility.)
    /// </summary>
    /// <param name="fields">Fields to query.</param>
    /// <param name="keywords">Terms to match (OR semantics).</param>
    /// <param name="index">Index type selecting the target directory.</param>
    /// <returns>Total hit count; 0 when nothing matched or the input was empty.</returns>
    public int GetTotla(string[] fields, string[] keywords, IndexType index)
    {
        if (fields == null || fields.Length == 0)
            return 0;
        if (keywords == null || keywords.Length == 0)
            return 0;
        var directory = GetLuceneDirectory(index);
        IndexSearcher searcher = null;
        try
        {
            var boolQuery = GetQuery(fields, keywords);
            var collector = TopScoreDocCollector.Create(20, false);
            searcher = new IndexSearcher(directory, true);
            searcher.Search(boolQuery, collector);
            return collector.TotalHits;
        }
        finally
        {
            // Dispose on every path — the original leaked on the zero-hit early return.
            searcher?.Dispose();
            directory.Dispose();
        }
    }

    /// <summary>
    /// Converts Lucene documents into entities by matching document field names
    /// to public instance property names (case-insensitive).
    /// </summary>
    /// <typeparam name="T">Entity type to instantiate.</typeparam>
    /// <param name="docList">Documents to convert.</param>
    /// <returns>Entity list; empty when <paramref name="docList"/> is null or empty.</returns>
    private List<T> ConvertDocToObj<T>(List<Document> docList) where T : new()
    {
        var list = new List<T>();
        // Guard: the original called docList.First() and crashed on an empty list.
        if (docList == null || docList.Count == 0)
            return list;
        var propertyList = typeof(T).GetProperties(BindingFlags.Public | BindingFlags.Instance);
        // Assumes all documents of one index share the same field set — TODO confirm.
        var fieldNames = docList[0].GetFields().Select(x => x.Name).ToList();
        foreach (var doc in docList)
        {
            var tObj = new T();
            foreach (var pInfo in propertyList)
            {
                var name = pInfo.Name;
                // Culture-safe, allocation-free comparison instead of ToLower().
                if (fieldNames.Any(x => string.Equals(x, name, StringComparison.OrdinalIgnoreCase)))
                {
                    SetValue<T>(pInfo, tObj, doc, name);
                }
            }
            list.Add(tObj);
        }
        return list;
    }

    /// <summary>
    /// Builds an OR query: any listed field matching any listed keyword is a hit.
    /// </summary>
    /// <param name="fields">Fields to query.</param>
    /// <param name="keywords">Terms to match.</param>
    /// <returns>Boolean OR query over every field/keyword pair.</returns>
    private BooleanQuery GetQuery(string[] fields, string[] keywords)
    {
        var boolQuery = new BooleanQuery();
        foreach (var field in fields)
        {
            foreach (var keyword in keywords)
            {
                boolQuery.Add(new TermQuery(new Term(field, keyword)), Occur.SHOULD);
            }
        }
        return boolQuery;
    }

    #endregion 查詢

    /// <summary>
    /// Assigns one document field to one entity property, converting the stored
    /// string to the property's CLR type. Unsupported property types are skipped.
    /// </summary>
    private void SetValue<T>(PropertyInfo pInfo, T tObj, Document doc, string name)
    {
        var raw = doc.Get(name);
        switch (pInfo.PropertyType.Name)
        {
            case "String":
                pInfo.SetValue(tObj, raw, null);
                break;
            case "Int32":
                pInfo.SetValue(tObj, GetInt(raw), null);
                break;
            case "Boolean":
                pInfo.SetValue(tObj, GetBool(raw), null);
                break;
            case "DateTime":
                pInfo.SetValue(tObj, GetDate(raw), null);
                break;
            case "Double":
                pInfo.SetValue(tObj, GetDouble(raw), null);
                break;
            case "Single":
                pInfo.SetValue(tObj, GetFloat(raw), null);
                break;
            case "Decimal":
                pInfo.SetValue(tObj, GetDecimal(raw), null);
                break;
        }
    }

    // Each helper below returns the parsed value, or the type's default when
    // the stored string is null/unparseable (TryParse zeroes the out variable).

    private int GetInt(string value)
    {
        int result;
        int.TryParse(value, out result);
        return result;
    }

    private DateTime GetDate(string value)
    {
        DateTime result;
        DateTime.TryParse(value, out result);
        return result;
    }

    private bool GetBool(string value)
    {
        bool result;
        bool.TryParse(value, out result);
        return result;
    }

    private double GetDouble(string value)
    {
        double result;
        double.TryParse(value, out result);
        return result;
    }

    private float GetFloat(string value)
    {
        float result;
        float.TryParse(value, out result);
        return result;
    }

    private decimal GetDecimal(string value)
    {
        decimal result;
        decimal.TryParse(value, out result);
        return result;
    }
}
本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源:https://mp.weixin.qq.com/s/W3wArrQBcxsZQvvhSqPOkw