使用 Lucene-Net 實現全文檢索

Lucene.net 是 Lucene 的 .NET 移植版本,是一個開源的全文檢索引擎開發包。它不是一個完整的全文檢索引擎,而是一個全文檢索引擎的架構,提供了完整的查詢引擎和索引引擎。

開發人員可以基於 Lucene.net 實現全文檢索的功能。

Lucene.net 是 Apache 軟件基金會贊助的開源項目,基於 Apache License 協議。

Lucene.net 並不是一個爬行搜索引擎,也不會自動地索引內容。我們得先將要索引的文檔中的文本抽取出來,然後再將其加到 Lucene.net 索引中。標準的步驟是先初始化一個 Analyzer、打開一個 IndexWriter、然後再將文檔一個接一個地加進去。一旦完成這些步驟,索引就可以在關閉前得到優化,同時所做的改變也會生效。這個過程可能比開發者習慣的方式更加手工化一些,但卻在數據的索引上給予你更多的靈活性,而且其效率也很高。

獲取索引目錄

    /// <summary>
        /// 獲取索引目錄
        /// </summary>
        /// <param >索引類型</param>
        /// <returns>索引目錄</returns>
        private LcStore.Directory GetLuceneDirectory(IndexType index)
        {
            var indexPath = string.Empty;
            try
            {
                var dirPath = ConfigHelper.GetAppSetting("LuceneIndexPath");

                var indexName = Enum.EnumHelper.GetEnumDescription(index);

                indexPath = Path.Combine(dirPath, indexName);

                return LcStore.FSDirectory.Open(indexPath);
            }
            catch (Exception ex)
            {
                NLogger.Write($"獲取索引目錄失敗" + Environment.NewLine +
                              $"路徑:{indexPath}" + Environment.NewLine +
                              $"異常信息:{ex}",
                             "Lucene""x""x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("獲取索引目錄異常,詳情請查看相關日誌");
            }
        }

        #endregion 獲取目錄

盤古分詞

   /// <summary>
        /// 盤古分詞
        /// </summary>
        /// <param >語句</param>
        /// <returns>詞組集合</returns>
        public string[] GetSplitKeywords(string keyword)
        {
            try
            {
                string ret = null;
                var reader = new StringReader(keyword);
                var ts = PanguAnalyzer.TokenStream(keyword, reader);
                var hasNext = ts.IncrementToken();
                Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
                while (hasNext)
                {
                    ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                    ret += ita.Term + "|";
                    hasNext = ts.IncrementToken();
                }
                ts.CloneAttributes();
                reader.Close();
                PanguAnalyzer.Close();

                if (string.IsNullOrWhiteSpace(ret)) return null;

                ret = ret.Substring(0, ret.Length - 1);
                return ret.Split('|');
            }
            catch (Exception ex)
            {
                NLogger.Write("分詞異常" + Environment.NewLine +
                              $"關鍵詞:{keyword}" + Environment.NewLine +
                              $"異常信息:{ex}",
                             "Lucene""x""x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("分詞出現異常,詳情請查看相關日誌");
            }
        }

        #endregion 分詞

創建索引或追加索引

     /// <summary>
        /// 創建索引或追加索引
        /// </summary>
        /// <param >數據集合</param>
        /// <param >索引類型</param>
        public void CreateOrAppendIndexes(List<Document> dataList, IndexType index)
        {
            if (dataList == null || dataList.Count == 0)
                return;

            IndexWriter writer;
            var directory = GetLuceneDirectory(index);
            try
            {
                //false表示追加(true表示刪除之前的重新寫入)
                writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
            }
            catch
            {
                //false表示追加(true表示刪除之前的重新寫入)
                writer = new IndexWriter(directory, PanguAnalyzer, true, IndexWriter.MaxFieldLength.LIMITED);
            }
            writer.MergeFactor = 1000;
            //writer.SetMaxBufferedDocs(1000);
            foreach (var doc in dataList)
            {
                writer.AddDocument(doc);
            }
            writer.Optimize();

            writer.Dispose();
            directory.Dispose();
        }

完整代碼

 /// <summary>
    /// Lucene搜索引擎幫助類
    /// </summary>
    public class LuceneHelper
    {
        /// <summary>
        /// 私有構造函數
        /// </summary>
        private LuceneHelper()
        {
        }

        #region 屬性

        private static LuceneHelper _instance;

        /// <summary>
        /// 單一實例
        /// </summary>
        public static LuceneHelper Instance => _instance ?? (_instance = new LuceneHelper());

        private Analyzer _analyzer;

        /// <summary>
        /// 分析器
        /// </summary>
        private Analyzer PanguAnalyzer => _analyzer ?? (_analyzer = new PanGuAnalyzer());

        #endregion 屬性

        #region 獲取目錄

        /// <summary>
        /// 獲取索引目錄
        /// </summary>
        /// <param >索引類型</param>
        /// <returns>索引目錄</returns>
        private LcStore.Directory GetLuceneDirectory(IndexType index)
        {
            var indexPath = string.Empty;
            try
            {
                var dirPath = ConfigHelper.GetAppSetting("LuceneIndexPath");

                var indexName = Enum.EnumHelper.GetEnumDescription(index);

                indexPath = Path.Combine(dirPath, indexName);

                return LcStore.FSDirectory.Open(indexPath);
            }
            catch (Exception ex)
            {
                NLogger.Write($"獲取索引目錄失敗" + Environment.NewLine +
                              $"路徑:{indexPath}" + Environment.NewLine +
                              $"異常信息:{ex}",
                             "Lucene""x""x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("獲取索引目錄異常,詳情請查看相關日誌");
            }
        }

        #endregion 獲取目錄

        #region 分詞

        /// <summary>
        /// 盤古分詞
        /// </summary>
        /// <param >語句</param>
        /// <returns>詞組集合</returns>
        public string[] GetSplitKeywords(string keyword)
        {
            try
            {
                string ret = null;
                var reader = new StringReader(keyword);
                var ts = PanguAnalyzer.TokenStream(keyword, reader);
                var hasNext = ts.IncrementToken();
                Lucene.Net.Analysis.Tokenattributes.ITermAttribute ita;
                while (hasNext)
                {
                    ita = ts.GetAttribute<Lucene.Net.Analysis.Tokenattributes.ITermAttribute>();
                    ret += ita.Term + "|";
                    hasNext = ts.IncrementToken();
                }
                ts.CloneAttributes();
                reader.Close();
                PanguAnalyzer.Close();

                if (string.IsNullOrWhiteSpace(ret)) return null;

                ret = ret.Substring(0, ret.Length - 1);
                return ret.Split('|');
            }
            catch (Exception ex)
            {
                NLogger.Write("分詞異常" + Environment.NewLine +
                              $"關鍵詞:{keyword}" + Environment.NewLine +
                              $"異常信息:{ex}",
                             "Lucene""x""x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("分詞出現異常,詳情請查看相關日誌");
            }
        }

        #endregion 分詞

        #region 索引增刪改查

        /// <summary>
        /// 創建索引或追加索引
        /// </summary>
        /// <param >數據集合</param>
        /// <param >索引類型</param>
        public void CreateOrAppendIndexes(List<Document> dataList, IndexType index)
        {
            if (dataList == null || dataList.Count == 0)
                return;

            IndexWriter writer;
            var directory = GetLuceneDirectory(index);
            try
            {
                //false表示追加(true表示刪除之前的重新寫入)
                writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
            }
            catch
            {
                //false表示追加(true表示刪除之前的重新寫入)
                writer = new IndexWriter(directory, PanguAnalyzer, true, IndexWriter.MaxFieldLength.LIMITED);
            }
            writer.MergeFactor = 1000;
            //writer.SetMaxBufferedDocs(1000);
            foreach (var doc in dataList)
            {
                writer.AddDocument(doc);
            }
            writer.Optimize();

            writer.Dispose();
            directory.Dispose();
        }

        /// <summary>
        /// 刪除索引
        /// </summary>
        /// <param >字段名</param>
        /// <param >字段值</param>
        /// <param >索引類型</param>
        public void DeleteIndexes(string field, string value, IndexType index)
        {
            IndexWriter writer = null;
            var directory = GetLuceneDirectory(index);
            try
            {
                writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
                var term = new Term(field, value);
                writer.DeleteDocuments(term);
                //var isSuccess = writer.HasDeletions();
                writer.Optimize();
            }
            catch (Exception ex)
            {
                NLogger.Write("刪除索引異常" + Environment.NewLine +
                              $"異常信息:{ex}""Lucene""x""x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("刪除索引異常,詳情請查看相關日誌");
            }
            finally
            {
                writer?.Dispose();
                directory?.Dispose();
            }
        }

        /// <summary>
        /// 更新索引;這裏實際上是先刪除原有索引,在創建新索引。
        /// 所以在更新索引時,一定要確保傳入的Document的所有字段都有值
        /// 否則將會被置爲空
        /// </summary>
        /// <param >字段名</param>
        /// <param >字段值</param>
        /// <param >文檔</param>
        /// <param >索引類型</param>
        public void UpdateIndexes(string field, string value, Document doc, IndexType index)
        {
            IndexWriter writer = null;
            var directory = GetLuceneDirectory(index);
            try
            {
                writer = new IndexWriter(directory, PanguAnalyzer, false, IndexWriter.MaxFieldLength.LIMITED);
                var term = new Term(field, value);
                writer.UpdateDocument(term, doc);
            }
            catch (Exception ex)
            {
                NLogger.Write("更新索引異常" + Environment.NewLine +
                              $"異常信息:{ex}""Lucene""x""x",
                             CustomException.UnknownError, CustomLogLevel.Error);
                throw new Exception("更新索引異常,詳情請查看相關日誌");
            }
            finally
            {
                writer?.Dispose();
                directory?.Dispose();
            }
        }

        #endregion 索引增刪改查

        #region 查詢

        /// <summary>
        /// 查詢
        /// </summary>
        /// <typeparam >實體類型</typeparam>
        /// <param >條件字段</param>
        /// <param >關鍵詞組</param>
        /// <param >索引類型</param>
        /// <param >排序,可爲空</param>
        /// <param >讀取數量</param>
        /// <returns>結果集</returns>
        public List<T> Search<T>
            (
            string[] fields,
            string[] keywords,
            IndexType index,
            Sort sort,
            int count
            ) where T : new()
        {
            if (fields == null || fields.Length == 0)
                return null;
            if (keywords == null || keywords.Length == 0)
                return null;

            //索引目錄
            var directory = GetLuceneDirectory(index);

            //查詢條件
            var boolQuery = GetQuery(fields, keywords);

            //索引查詢器
            var searcher = new IndexSearcher(directory, true);

            TopDocs docs;
            if (sort != null)
                docs = searcher.Search(boolQuery, null, count, sort);
            else
                docs = searcher.Search(boolQuery, count);
            if (docs == null || docs.TotalHits == 0)
                return null;

            //文檔集合
            var docList = docs.ScoreDocs.Select(sd => searcher.Doc(sd.Doc)).ToList();

            //反射賦值
            var list = ConvertDocToObj<T>(docList);

            searcher.Dispose();
            directory.Dispose();

            return list;
        }

        /// <summary>
        /// 查詢分頁數據(指定排序方式)
        /// </summary>
        /// <typeparam >實體類型</typeparam>
        /// <param >條件字段</param>
        /// <param >關鍵詞組</param>
        /// <param >索引類型</param>
        /// <param >排序,必填</param>
        /// <param >頁碼</param>
        /// <param >頁數</param>
        /// <returns>結果集</returns>
        public PagedResult<List<T>> SearchByPaged<T>
            (
            string[] fields,
            string[] keywords,
            IndexType index,
            Sort sort,
            int pageNumber = 1,
            int pageSize = 20
            ) where T : new()
        {
            if (fields == null || fields.Length == 0)
                return null;
            if (keywords == null || keywords.Length == 0)
                return null;

            //索引目錄
            var directory = GetLuceneDirectory(index);

            //查詢條件
            var boolQuery = GetQuery(fields, keywords);

            var collector = TopFieldCollector
                .Create(sort, pageNumber * pageSize, false, false, false, false);

            var searcher = new IndexSearcher(directory, true);

            searcher.Search(boolQuery, collector);

            if (collector == null || collector.TotalHits == 0)
                return null;

            //分頁
            var start = (pageNumber - 1) * pageSize;
            var limit = pageSize;
            var hits = collector.TopDocs(start, limit).ScoreDocs;
            var totalCount = collector.TotalHits;

            var docList = hits.Select(sd => searcher.Doc(sd.Doc)).ToList();

            //反射賦值
            var list = ConvertDocToObj<T>(docList);

            searcher.Dispose();
            directory.Dispose();

            return new PagedResult<List<T>>
            {
                Total = totalCount,
                Result = list
            };
        }

        /// <summary>
        /// 查詢分頁數據(默認排序方式)
        /// </summary>
        /// <typeparam >實體類型</typeparam>
        /// <param >條件字段</param>
        /// <param >關鍵詞組</param>
        /// <param >索引類型</param>
        /// <param >頁碼</param>
        /// <param >頁數</param>
        /// <returns>結果集</returns>
        public PagedResult<List<T>> SearchByPaged<T>
            (
            string[] fields,
            string[] keywords,
            IndexType index,
            int pageNumber = 1,
            int pageSize = 20
            ) where T : new()
        {
            if (fields == null || fields.Length == 0)
                return null;
            if (keywords == null || keywords.Length == 0)
                return null;

            //索引目錄
            var directory = GetLuceneDirectory(index);

            //查詢條件
            var boolQuery = GetQuery(fields, keywords);

            var collector = TopScoreDocCollector.Create(pageNumber * pageSize, false);
            var searcher = new IndexSearcher(directory, true);

            searcher.Search(boolQuery, collector);

            if (collector == null || collector.TotalHits == 0)
                return null;

            //分頁
            var start = (pageNumber - 1) * pageSize;
            var limit = pageSize;
            var hits = collector.TopDocs(start, limit).ScoreDocs;
            var totalCount = collector.TotalHits;

            var docList = hits.Select(sd => searcher.Doc(sd.Doc)).ToList();

            //反射賦值
            var list = ConvertDocToObj<T>(docList);

            searcher.Dispose();
            directory.Dispose();

            return new PagedResult<List<T>>
            {
                Total = totalCount,
                Result = list
            };
        }

        /// <summary>
        /// 查詢分頁數據(默認排序方式)
        /// </summary>
        /// <param >條件字段</param>
        /// <param >關鍵詞組</param>
        /// <param >索引類型</param>
        /// <returns>結果集</returns>
        public int GetTotla(string[] fields, string[] keywords, IndexType index)
        {
            if (fields == null || fields.Length == 0)
                return 0;
            if (keywords == null || keywords.Length == 0)
                return 0;

            //索引目錄
            var directory = GetLuceneDirectory(index);

            //查詢條件
            var boolQuery = GetQuery(fields, keywords);

            var collector = TopScoreDocCollector.Create(20, false);
            var searcher = new IndexSearcher(directory, true);

            searcher.Search(boolQuery, collector);

            if (collector == null || collector.TotalHits == 0)
                return 0;

            searcher.Dispose();
            directory.Dispose();

            return collector.TotalHits;
        }

        /// <summary>
        /// 文檔轉換爲對象
        /// </summary>
        /// <typeparam >實體類型</typeparam>
        /// <param >文檔集合</param>
        /// <returns>對象集合</returns>
        private List<T> ConvertDocToObj<T>(List<Document> docList) where T : new()
        {
            var type = typeof(T);
            var propertyList = type.GetProperties(BindingFlags.Public | BindingFlags.Instance);

            var list = new List<T>();
            var firstDoc = docList.First();
            var fieldNames = firstDoc.GetFields().Select(x => x.Name).ToList();

            foreach (var doc in docList)
            {
                var tObj = new T();
                foreach (var pInfo in propertyList)
                {
                    var name = pInfo.Name;
                    if (fieldNames.Any(x => x.ToLower() == name.ToLower()))
                    {
                        SetValue<T>(pInfo, tObj, doc, name);
                    }
                }

                list.Add(tObj);
            }
            return list;
        }

        /// <summary>
        /// 獲取查詢條件
        /// </summary>
        /// <param >條件字段</param>
        /// <param >關鍵詞組</param>
        /// <returns></returns>
        private BooleanQuery GetQuery(string[] fields, string[] keywords)
        {
            var boolQuery = new BooleanQuery();
            foreach (var field in fields)
            {
                foreach (var keyword in keywords)
                {
                    var t = new TermQuery(new Term(field, keyword));
                    boolQuery.Add(t, Occur.SHOULD);
                }
            }
            return boolQuery;
        }

        #endregion 查詢

        private void SetValue<T>(PropertyInfo pInfo, T tObj, Document doc, string name)
        {
            var pType = pInfo.PropertyType.Name;
            switch (pType)
            {
                case "String":
                    pInfo.SetValue(tObj, doc.Get(name), null);
                    break;

                case "Int32":
                    pInfo.SetValue(tObj, GetInt(doc.Get(name)), null);
                    break;

                case "Boolean":
                    pInfo.SetValue(tObj, GetBool(doc.Get(name)), null);
                    break;

                case "DateTime":
                    pInfo.SetValue(tObj, GetDate(doc.Get(name)), null);
                    break;

                case "Double":
                    pInfo.SetValue(tObj, GetDouble(doc.Get(name)), null);
                    break;

                case "Single":
                    pInfo.SetValue(tObj, GetFloat(doc.Get(name)), null);
                    break;

                case "Decimal":
                    pInfo.SetValue(tObj, GetDecimal(doc.Get(name)), null);
                    break;
            }
        }

        private int GetInt(string value)
        {
            var result = 0;
            int.TryParse(value, out result);
            return result;
        }

        private DateTime GetDate(string value)
        {
            DateTime result;
            DateTime.TryParse(value, out result);
            return result;
        }

        private bool GetBool(string value)
        {
            bool result;
            bool.TryParse(value, out result);
            return result;
        }

        private double GetDouble(string value)
        {
            double result;
            double.TryParse(value, out result);
            return result;
        }

        private float GetFloat(string value)
        {
            float result;
            float.TryParse(value, out result);
            return result;
        }

        private decimal GetDecimal(string value)
        {
            decimal result;
            decimal.TryParse(value, out result);
            return result;
        }
    }
本文由 Readfog 進行 AMP 轉碼,版權歸原作者所有。
來源https://mp.weixin.qq.com/s/W3wArrQBcxsZQvvhSqPOkw