""" BM25 稀疏嵌入器 基于 FastEmbed 的 Qdrant/bm25 模型,完全离线运行 """ from typing import List from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding from .config import FASTEMBED_CACHE_PATH class BM25SparseEmbedder: """BM25 稀疏嵌入包装器,与现有嵌入器风格统一""" def __init__(self): self.model = SparseTextEmbedding( model_name="Qdrant/bm25", cache_dir=FASTEMBED_CACHE_PATH, local_files_only=True, # 强制离线,永不联网 ) def embed_documents(self, texts: List[str]) -> List[dict]: """返回稀疏向量列表,每个为 Qdrant 兼容的 dict(indices+values)""" return [vec.as_object() for vec in self.model.embed(texts)] def embed_query(self, text: str) -> dict: """返回单个稀疏向量""" return list(self.model.embed([text]))[0].as_object() # 全局单例 _sparse_embedder_instance = None def get_sparse_embedder() -> BM25SparseEmbedder: global _sparse_embedder_instance if _sparse_embedder_instance is None: _sparse_embedder_instance = BM25SparseEmbedder() return _sparse_embedder_instance