diff --git a/rag_indexer/index_builder.py b/rag_indexer/index_builder.py
index 8970351..b60319e 100644
--- a/rag_indexer/index_builder.py
+++ b/rag_indexer/index_builder.py
@@ -71,6 +71,9 @@ class IndexBuilderConfig:
 
     # 其他切分器参数(当 splitter_type 非父子块时使用)
     extra_splitter_kwargs: Dict[str, Any] = field(default_factory=dict)
+
+    # Hybrid (dense + sparse) retrieval switch; False keeps the dense-only behaviour.
+    enable_sparse: bool = False
 
 # ---------- 索引构建器 ----------
 class IndexBuilder:
@@ -116,10 +119,29 @@ class IndexBuilder:
         self.embeddings = self.embedder.as_langchain_embeddings()
 
         # 初始化向量存储
-        self.vector_store = QdrantVectorStore(
-            collection_name=config.collection_name,
-            embeddings=self.embeddings if self.embedder is None else None,
-        )
+        # Hybrid retrieval is opt-in. With the default enable_sparse=False the
+        # constructor receives exactly the arguments it received before.
+        # To enable it, install fastembed and set enable_sparse=True.
+        # NOTE(review): confirm QdrantVectorStore accepts the sparse kwargs below.
+        qdrant_kwargs = {
+            "collection_name": config.collection_name,
+            # Same keyword name and explicit None as the previous call, so the
+            # dense-only path is unchanged.
+            "embeddings": self.embeddings if self.embedder is None else None,
+        }
+
+        if self.config.enable_sparse:
+            try:
+                from langchain_qdrant import FastEmbedSparse, RetrievalMode
+                qdrant_kwargs["sparse_embedding"] = FastEmbedSparse(model_name="Qdrant/bm25")
+                qdrant_kwargs["retrieval_mode"] = RetrievalMode.HYBRID
+                logger.info("✅ 稀疏向量支持已启用")
+            except ImportError:
+                logger.warning("⚠️ fastembed 未安装,无法启用稀疏向量,继续使用纯稠密")
+            except Exception as e:
+                logger.warning(f"⚠️ 稀疏向量初始化失败: {e},继续使用纯稠密")
+
+        self.vector_store = QdrantVectorStore(**qdrant_kwargs)
 
         # 根据切分类型初始化相关组件
         self._init_splitters_and_retriever()
diff --git a/rag_indexer/requirements.txt b/rag_indexer/requirements.txt
index b7e65d4..1b4460e 100644
--- a/rag_indexer/requirements.txt
+++ b/rag_indexer/requirements.txt
@@ -14,6 +14,8 @@ tiktoken>=0.12.0
 
 # Vector DB
 qdrant-client==1.17.1
+# Optional: only needed for sparse-vector (hybrid retrieval) support
+# fastembed>=0.3.0
 
 # HTTP
 httpx==0.28.1