2026-04-19 22:01:55 +08:00
|
|
|
"""检查 Qdrant 中存储的向量质量。"""
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
import numpy as np
|
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
|
from qdrant_client import QdrantClient
|
2026-04-21 16:27:05 +08:00
|
|
|
|
|
|
|
|
# Put the project root (the parent of this script's directory) and its
# backend/ sub-directory at the front of the import search path, backend/
# first -- presumably so the project-local import further down resolves
# when the script is run directly.
project_root = os.path.join(os.path.dirname(__file__), os.pardir)
backend_dir = os.path.join(project_root, "backend")
# One slice assignment yields the same final order as inserting
# project_root and then backend_dir at index 0.
sys.path[:0] = [backend_dir, project_root]
|
2026-04-21 10:26:37 +08:00
|
|
|
# Load variables from a .env file (python-dotenv) before the os.getenv
# lookups for the Qdrant settings further down.
load_dotenv()
|
2026-04-21 16:27:05 +08:00
|
|
|
|
|
|
|
|
from rag_core import LlamaCppEmbedder
|
2026-04-19 22:01:55 +08:00
|
|
|
|
|
|
|
|
# Qdrant connection settings. The URL and API key are read from the
# environment; the URL falls back to a local instance and the API key to
# None when unset. The collection name is fixed.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://127.0.0.1:6333")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
COLLECTION_NAME = "rag_documents"
|
|
|
|
|
|
|
|
|
|
# Qdrant client built from the URL / API key settings defined above.
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
|
|
|
|
|
# Embedder from the project-local rag_core module, constructed with its
# default arguments; used later to re-embed the sampled text.
embedder = LlamaCppEmbedder()
|
|
|
|
|
|
|
|
|
|
# Fetch a single point, including its vector and payload, to use as the
# sample for the checks below. The second element of the scroll() result
# is not needed here.
points, _ = client.scroll(
    collection_name=COLLECTION_NAME,
    limit=1,
    with_vectors=True,
    with_payload=True,
)

if not points:
    print(f"集合 '{COLLECTION_NAME}' 为空")
    # Use sys.exit() instead of the bare exit(): the latter is injected by
    # the `site` module for interactive sessions and is not available under
    # `python -S` or in frozen/embedded interpreters. Exit status is
    # unchanged (0).
    sys.exit()
|
|
|
|
|
|
|
|
|
|
# Extract the sampled point's vector and payload and print basic statistics.
sample = points[0]
raw_vec = sample.vector
if isinstance(raw_vec, dict):
    # A dict holds several vectors keyed by name; inspect the first one.
    # next(..., []) also covers an empty dict, which would otherwise raise
    # IndexError.
    stored_vec = next(iter(raw_vec.values()), [])
elif isinstance(raw_vec, list):
    stored_vec = raw_vec
else:
    # No usable vector on the point (e.g. None): fall back to an empty list.
    stored_vec = []

stored_payload = sample.payload or {}
# Keep the complete text in stored_text and truncate only the preview.
# The stored vector was presumably computed from the whole page_content
# (verify against the indexing code); re-embedding a 200-character prefix
# would lower the similarity for longer chunks and could trigger a false
# "rebuild the index" warning.
stored_text = str(stored_payload.get("page_content", ""))
preview_text = stored_text[:200]

print(f"内容预览:\n{preview_text}...\n")
print(f"向量维度: {len(stored_vec)}")  # type: ignore
print(f"前5个值: {stored_vec[:5]}")  # type: ignore
# all() over an empty sequence is vacuously True, so a missing vector must
# not be reported as "all zeros".
is_all_zero = len(stored_vec) > 0 and all(v == 0.0 for v in stored_vec)  # type: ignore
print(f"是否全零: {is_all_zero}")
|
|
|
|
|
|
|
|
|
|
# Re-embed the sampled text and compare the result with the stored vector
# using cosine similarity.
if stored_text:
    new_vec = embedder.embed_query(stored_text)
    print(f"\n重新编码前5个值: {new_vec[:5]}")

    stored_arr = np.asarray(stored_vec, dtype=float)
    new_arr = np.asarray(new_vec, dtype=float)

    if stored_arr.shape != new_arr.shape:
        # np.dot would raise ValueError on mismatched (or empty) vectors;
        # a dimension mismatch already means the index does not match the
        # current embedder.
        print(
            f"\n⚠️ 向量维度不一致 (存储: {stored_arr.size}, "
            f"重新编码: {new_arr.size}),建议删除集合并重建索引"
        )
    else:
        norm_product = float(np.linalg.norm(stored_arr) * np.linalg.norm(new_arr))
        if norm_product == 0.0:
            # A zero-norm vector would make the similarity NaN, and
            # `nan < 0.8` is False, which would wrongly report the vectors
            # as consistent.
            print("\n⚠️ 向量范数为零,无法计算余弦相似度,建议删除集合并重建索引")
        else:
            similarity = float(np.dot(stored_arr, new_arr) / norm_product)
            print(f"余弦相似度: {similarity:.4f}")

            # 0.8 is the script's threshold for "same embedding model".
            if similarity < 0.8:
                print("\n⚠️ 相似度过低,建议删除集合并重建索引")
            else:
                print("\n✅ 向量一致")
else:
    print("\n⚠️ 样本无文本内容")
|