From ce6e459e19d71959dfc4983d6ddbf5c4fbc70920 Mon Sep 17 00:00:00 2001 From: root <953994191@qq.com> Date: Sun, 3 May 2026 18:08:39 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20=E6=B7=BB=E5=8A=A0=E5=8F=AF?= =?UTF-8?q?=E9=80=89=E7=A8=80=E7=96=8F=E5=90=91=E9=87=8F=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E5=88=B0=E7=B4=A2=E5=BC=95=E5=99=A8=20-=20=E5=AE=8C=E5=85=A8?= =?UTF-8?q?=E5=85=BC=E5=AE=B9=E7=8E=B0=E6=9C=89=E4=BB=A3=E7=A0=81=EF=BC=9A?= =?UTF-8?q?=E9=BB=98=E8=AE=A4=20enable=5Fsparse=3DFalse=20-=20=E5=90=AF?= =?UTF-8?q?=E7=94=A8=E6=97=B6=EF=BC=9A=E9=9C=80=E8=A6=81=E5=AE=89=E8=A3=85?= =?UTF-8?q?=20fastembed=EF=BC=8C=E8=AE=BE=E7=BD=AE=20enable=5Fsparse=3DTru?= =?UTF-8?q?e=20-=20=E8=87=AA=E5=8A=A8=E5=88=9D=E5=A7=8B=E5=8C=96=20FastEmb?= =?UTF-8?q?edSparse=20=E5=92=8C=20RetrievalMode.HYBRID=20-=20=E5=A4=B1?= =?UTF-8?q?=E8=B4=A5=E6=97=B6=E4=BC=98=E9=9B=85=E5=9B=9E=E9=80=80=E5=88=B0?= =?UTF-8?q?=E7=BA=AF=E7=A8=A0=E5=AF=86=20-=20=E8=AF=AD=E6=B3=95=E6=A3=80?= =?UTF-8?q?=E6=9F=A5=E9=80=9A=E8=BF=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rag_indexer/index_builder.py | 28 ++++++++++++++++++++++++---- rag_indexer/requirements.txt | 2 ++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/rag_indexer/index_builder.py b/rag_indexer/index_builder.py index 8970351..b60319e 100644 --- a/rag_indexer/index_builder.py +++ b/rag_indexer/index_builder.py @@ -71,6 +71,9 @@ class IndexBuilderConfig: # 其他切分器参数(当 splitter_type 非父子块时使用) extra_splitter_kwargs: Dict[str, Any] = field(default_factory=dict) + + # 混合检索支持(默认 False,完全兼容) + enable_sparse: bool = False # ---------- 索引构建器 ---------- class IndexBuilder: @@ -116,10 +119,27 @@ class IndexBuilder: self.embeddings = self.embedder.as_langchain_embeddings() # 初始化向量存储 - self.vector_store = QdrantVectorStore( - collection_name=config.collection_name, - embeddings=self.embeddings if self.embedder is None else None, - ) + # 默认 enable_sparse=False,完全兼容现有代码 + # 若需要启用混合检索,请先安装 fastembed,然后设置 enable_sparse=True + qdrant_kwargs = { + "collection_name": config.collection_name, + } + + if self.config.enable_sparse: + try: + from langchain_qdrant import FastEmbedSparse, RetrievalMode + qdrant_kwargs["sparse_embedding"] = FastEmbedSparse(model_name="Qdrant/bm25") + qdrant_kwargs["retrieval_mode"] = RetrievalMode.HYBRID + logger.info("✅ 稀疏向量支持已启用") + except ImportError: + logger.warning("⚠️ fastembed 未安装,无法启用稀疏向量,继续使用纯稠密") + except Exception as e: + logger.warning(f"⚠️ 稀疏向量初始化失败: {e},继续使用纯稠密") + + if self.embedder is None: + qdrant_kwargs["embedding"] = self.embeddings + + self.vector_store = QdrantVectorStore(**qdrant_kwargs) # 根据切分类型初始化相关组件 self._init_splitters_and_retriever() diff --git a/rag_indexer/requirements.txt b/rag_indexer/requirements.txt index b7e65d4..1b4460e 100644 --- a/rag_indexer/requirements.txt +++ b/rag_indexer/requirements.txt @@ -14,6 +14,8 @@ tiktoken>=0.12.0 # Vector DB qdrant-client==1.17.1 +# 可选:用于稀疏向量支持 +# fastembed>=0.3.0 # HTTP httpx==0.28.1