"""检查 Qdrant 中存储的向量质量。""" import os import sys import numpy as np from dotenv import load_dotenv from qdrant_client import QdrantClient # 添加项目根目录和 backend 目录到 Python 路径 project_root = os.path.join(os.path.dirname(__file__), "..") backend_dir = os.path.join(project_root, "backend") sys.path.insert(0, project_root) sys.path.insert(0, backend_dir) load_dotenv() from rag_core import LlamaCppEmbedder QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333") QDRANT_API_KEY = os.getenv("QDRANT_API_KEY") COLLECTION_NAME = "rag_documents" client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY) embedder = LlamaCppEmbedder() # 获取样本 points, _ = client.scroll( collection_name=COLLECTION_NAME, limit=1, with_vectors=True, with_payload=True, ) if not points: print(f"集合 '{COLLECTION_NAME}' 为空") exit() sample = points[0] raw_vec = sample.vector if isinstance(raw_vec, dict): stored_vec = list(raw_vec.values())[0] elif isinstance(raw_vec, list): stored_vec = raw_vec else: stored_vec = [] stored_payload = sample.payload or {} stored_text = str(stored_payload.get("page_content", ""))[:200] print(f"内容预览:\n{stored_text}...\n") print(f"向量维度: {len(stored_vec)}") # type: ignore print(f"前5个值: {stored_vec[:5]}") # type: ignore print(f"是否全零: {all(v == 0.0 for v in stored_vec)}") # type: ignore # 重新编码对比 if stored_text: new_vec = embedder.embed_query(stored_text) similarity = np.dot(stored_vec, new_vec) / (np.linalg.norm(stored_vec) * np.linalg.norm(new_vec)) # type: ignore print(f"\n重新编码前5个值: {new_vec[:5]}") print(f"余弦相似度: {similarity:.4f}") if similarity < 0.8: print("\n⚠️ 相似度过低,建议删除集合并重建索引") else: print("\n✅ 向量一致") else: print("\n⚠️ 样本无文本内容")