feat: 实现 BM25 稀疏 + 稠密向量混合检索功能
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Has been cancelled
This commit is contained in:
@@ -41,15 +41,6 @@ try:
|
||||
except ImportError:
|
||||
HAS_MODEL_SERVICES = False
|
||||
|
||||
# 尝试导入稀疏模型配置(如果可用)
|
||||
try:
|
||||
from app.config import SPARSE_MODEL_PATH, SPARSE_MODEL_NAME
|
||||
HAS_SPARSE_CONFIG = True
|
||||
except ImportError:
|
||||
HAS_SPARSE_CONFIG = False
|
||||
SPARSE_MODEL_PATH = "./models/sparse"
|
||||
SPARSE_MODEL_NAME = "Qdrant/bm25"
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------- 配置数据类 ----------
|
||||
@@ -112,37 +103,27 @@ class IndexBuilder:
|
||||
# 设置嵌入模型 - 优先使用外部提供的,然后尝试使用新服务,最后回退到原来的方式
|
||||
if embeddings is not None:
|
||||
self.embeddings = embeddings
|
||||
self.embedder = None
|
||||
self._embedder = None
|
||||
logger.info("使用外部提供的嵌入模型")
|
||||
elif HAS_MODEL_SERVICES:
|
||||
try:
|
||||
self.embeddings = get_embedding_service()
|
||||
self.embedder = None
|
||||
self._embedder = None
|
||||
logger.info("使用 model_services 提供的嵌入服务")
|
||||
except Exception as e:
|
||||
logger.warning(f"获取嵌入服务失败,回退到 LlamaCppEmbedder: {e}")
|
||||
self.embedder = LlamaCppEmbedder()
|
||||
self.embeddings = self.embedder.as_langchain_embeddings()
|
||||
self._embedder = LlamaCppEmbedder()
|
||||
self.embeddings = self._embedder.as_langchain_embeddings()
|
||||
else:
|
||||
self.embedder = LlamaCppEmbedder()
|
||||
self.embeddings = self.embedder.as_langchain_embeddings()
|
||||
self._embedder = LlamaCppEmbedder()
|
||||
self.embeddings = self._embedder.as_langchain_embeddings()
|
||||
|
||||
# 初始化稀疏嵌入(使用本地缓存目录)
|
||||
from langchain_qdrant import FastEmbedSparse, RetrievalMode
|
||||
self.sparse_embeddings = FastEmbedSparse(
|
||||
model_name=SPARSE_MODEL_NAME,
|
||||
cache_dir=SPARSE_MODEL_PATH
|
||||
)
|
||||
logger.info(f"✅ FastEmbedSparse 初始化成功 (cache_dir={SPARSE_MODEL_PATH})")
|
||||
|
||||
# 初始化向量存储(混合检索模式)
|
||||
# 初始化向量存储(自动支持稠密+稀疏混合检索)
|
||||
self.vector_store = QdrantVectorStore(
|
||||
collection_name=config.collection_name,
|
||||
embedding=self.embeddings if self.embedder is None else None,
|
||||
sparse_embedding=self.sparse_embeddings,
|
||||
retrieval_mode=RetrievalMode.HYBRID,
|
||||
embedding=self.embeddings if self._embedder is None else None
|
||||
)
|
||||
logger.info("✅ 混合检索向量存储初始化成功")
|
||||
logger.info("✅ 混合检索向量存储初始化成功(稠密+BM25稀疏)")
|
||||
|
||||
# 根据切分类型初始化相关组件
|
||||
self._init_splitters_and_retriever()
|
||||
|
||||
Reference in New Issue
Block a user