Files
ailine/backend/rag_core/sparse_embedder.py
root 60afa86ded
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Has been cancelled
feat: 实现 BM25 稀疏 + 稠密向量混合检索功能
2026-05-04 02:01:22 +08:00

34 lines
1.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
BM25 稀疏嵌入器
基于 FastEmbed 的 Qdrant/bm25 模型,完全离线运行
"""
from typing import List
from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
from app.config import FASTEMBED_CACHE_PATH
class BM25SparseEmbedder:
    """BM25 sparse-embedding wrapper around FastEmbed's ``Qdrant/bm25`` model.

    Exposes an ``embed_documents`` / ``embed_query`` pair so it can be used
    in the same style as the project's other embedders.
    """

    def __init__(self):
        """Load the ``Qdrant/bm25`` model from the local FastEmbed cache."""
        self.model = SparseTextEmbedding(
            model_name="Qdrant/bm25",
            cache_dir=FASTEMBED_CACHE_PATH,
            # Force offline mode: never reach out to the network.
            local_files_only=True,
        )

    def embed_documents(self, texts: List[str]) -> List[dict]:
        """Embed documents for indexing.

        Args:
            texts: Document texts to encode.

        Returns:
            One entry per input text, each the ``as_object()`` form of the
            sparse embedding (a Qdrant-compatible dict of indices + values).
        """
        return [vec.as_object() for vec in self.model.embed(texts)]

    def embed_query(self, text: str) -> dict:
        """Embed a single search query.

        Uses the model's query-side encoder (``query_embed``) instead of the
        document-side ``embed``: FastEmbed encodes BM25 queries differently
        from documents, so reusing ``embed`` here would skew hybrid-search
        scores.

        Args:
            text: The query string.

        Returns:
            The ``as_object()`` form of the query's sparse embedding.
        """
        query_vectors = self.model.query_embed([text])
        # Exactly one text was submitted, so only the first result is needed.
        return next(iter(query_vectors)).as_object()
# Module-level singleton, created lazily on first request.
_sparse_embedder_instance = None


def get_sparse_embedder() -> BM25SparseEmbedder:
    """Return the shared ``BM25SparseEmbedder``, constructing it on first call."""
    global _sparse_embedder_instance
    if _sparse_embedder_instance is not None:
        return _sparse_embedder_instance
    _sparse_embedder_instance = BM25SparseEmbedder()
    return _sparse_embedder_instance