feat: 实现 BM25 稀疏 + 稠密向量混合检索功能
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Has been cancelled
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Has been cancelled
This commit is contained in:
34
backend/rag_core/sparse_embedder.py
Normal file
34
backend/rag_core/sparse_embedder.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""
|
||||
BM25 稀疏嵌入器
|
||||
基于 FastEmbed 的 Qdrant/bm25 模型,完全离线运行
|
||||
"""
|
||||
from typing import List
|
||||
from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
|
||||
from app.config import FASTEMBED_CACHE_PATH
|
||||
|
||||
class BM25SparseEmbedder:
    """BM25 sparse-embedding wrapper, styled like the project's other embedders.

    Wraps FastEmbed's ``Qdrant/bm25`` sparse model. Documents and queries are
    encoded through separate FastEmbed entry points (``embed`` versus
    ``query_embed``) because the BM25 model weights the two sides differently.
    """

    def __init__(self):
        """Load the ``Qdrant/bm25`` model from the local FastEmbed cache."""
        self.model = SparseTextEmbedding(
            model_name="Qdrant/bm25",
            cache_dir=FASTEMBED_CACHE_PATH,
            local_files_only=True,  # force offline mode: never reach out to the network
        )

    def embed_documents(self, texts: List[str]) -> List[dict]:
        """Encode documents into sparse vectors.

        Args:
            texts: Document texts to encode.

        Returns:
            One dict per input text, as produced by the embedding's
            ``as_object()`` (indices + values, the layout Qdrant accepts).
            An empty input yields an empty list.
        """
        return [vec.as_object() for vec in self.model.embed(texts)]

    def embed_query(self, text: str) -> dict:
        """Encode a single search query into a sparse vector.

        Uses the model's query-side encoder rather than the document-side
        one, so the query is not weighted as if it were a stored document.

        Args:
            text: The query string.

        Returns:
            The ``as_object()`` dict (indices + values) of the query vector.
        """
        # ``query_embed`` yields one embedding per query; only one is passed.
        query_vector = next(iter(self.model.query_embed(text)))
        return query_vector.as_object()
|
||||
|
||||
# Module-wide singleton; stays None until the first get_sparse_embedder() call.
_sparse_embedder_instance = None


def get_sparse_embedder() -> BM25SparseEmbedder:
    """Return the shared BM25SparseEmbedder, constructing it on first use."""
    global _sparse_embedder_instance
    if _sparse_embedder_instance is not None:
        return _sparse_embedder_instance
    _sparse_embedder_instance = BM25SparseEmbedder()
    return _sparse_embedder_instance
|
||||
Reference in New Issue
Block a user