34 lines
1.2 KiB
Python
34 lines
1.2 KiB
Python
|
|
"""
|
|||
|
|
BM25 稀疏嵌入器
|
|||
|
|
基于 FastEmbed 的 Qdrant/bm25 模型,完全离线运行
|
|||
|
|
"""
|
|||
|
|
from typing import List
|
|||
|
|
from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
|
|||
|
|
from app.config import FASTEMBED_CACHE_PATH
|
|||
|
|
|
|||
|
|
class BM25SparseEmbedder:
    """BM25 sparse-embedding wrapper, styled like the project's other embedders.

    Wraps FastEmbed's ``SparseTextEmbedding`` configured with the
    ``Qdrant/bm25`` model, loaded from ``FASTEMBED_CACHE_PATH``.
    """

    def __init__(self):
        """Load the ``Qdrant/bm25`` model from the local FastEmbed cache."""
        self.model = SparseTextEmbedding(
            model_name="Qdrant/bm25",
            cache_dir=FASTEMBED_CACHE_PATH,
            local_files_only=True,  # force offline mode; never go to the network
        )

    def embed_documents(self, texts: List[str]) -> List[dict]:
        """Embed documents for indexing.

        Args:
            texts: Documents to embed.

        Returns:
            One Qdrant-compatible dict (indices + values) per input text,
            in input order. An empty input yields an empty list.
        """
        return [vec.as_object() for vec in self.model.embed(texts)]

    def embed_query(self, text: str) -> dict:
        """Embed a single search query.

        Uses the model's dedicated query path (``query_embed``) rather than
        the document path: for BM25 the document-side weighting (term
        frequency / length normalisation) must not be applied to the query,
        otherwise retrieval scores are skewed.

        Args:
            text: The query string.

        Returns:
            A single Qdrant-compatible dict (indices + values).
        """
        # query_embed yields lazily; take the first (and only) result
        # without materialising an intermediate list.
        return next(iter(self.model.query_embed([text]))).as_object()
|
|||
|
|
|
|||
|
|
# Process-wide singleton; stays None until get_sparse_embedder() is first called.
_sparse_embedder_instance = None


def get_sparse_embedder() -> BM25SparseEmbedder:
    """Return the shared ``BM25SparseEmbedder``, constructing it on first use."""
    global _sparse_embedder_instance
    if _sparse_embedder_instance is not None:
        return _sparse_embedder_instance
    # First call: build the embedder once and cache it at module level.
    _sparse_embedder_instance = BM25SparseEmbedder()
    return _sparse_embedder_instance
|