refactor: 重构RAG核心组件,简化代码结构和测试文件
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Failing after 6m53s

This commit is contained in:
2026-05-04 17:58:10 +08:00
parent a07e398739
commit 9841f47432
31 changed files with 578 additions and 1496 deletions

View File

@@ -6,10 +6,6 @@ import asyncio
import logging
import sys
from pathlib import Path
from dotenv import load_dotenv
# 加载 .env 文件
load_dotenv()
from rag_indexer.index_builder import IndexBuilder, IndexBuilderConfig
from rag_indexer.splitters import SplitterType
@@ -38,7 +34,7 @@ def get_input_path() -> Path:
if len(sys.argv) > 1:
return Path(sys.argv[1])
# 默认测试路径(可按需修改)
return Path("data/user_docs/doublestory.txt")
return Path("data/corpus/三国演义.txt")
async def main():

View File

@@ -45,6 +45,11 @@ class IndexBuilderConfig:
child_chunk_size: int = 200
child_chunk_overlap: int = 20
child_splitter_type: SplitterType = SplitterType.SEMANTIC # 子块默认语义切分
# 子块语义切分参数
child_buffer_size: int = 1
child_breakpoint_threshold_type: str = "percentile"
child_breakpoint_threshold_amount: float = 90 # 降低阈值,让切分更激进
child_min_chunk_size: int = 50 # 降低最小块大小
# 检索参数
search_k: int = 5
@@ -86,7 +91,6 @@ class IndexBuilder:
# 初始化向量存储(自动支持稠密+稀疏混合检索)
self.vector_store = QdrantHybridStore(
collection_name=config.collection_name,
embeddings=self.embeddings,
)
logger.info("✅ 混合检索向量存储初始化成功(稠密+BM25稀疏")
@@ -125,6 +129,10 @@ class IndexBuilder:
self.child_splitter = get_splitter(
SplitterType.SEMANTIC,
embeddings=self.embeddings,
buffer_size=cfg.child_buffer_size,
breakpoint_threshold_type=cfg.child_breakpoint_threshold_type,
breakpoint_threshold_amount=cfg.child_breakpoint_threshold_amount,
min_chunk_size=cfg.child_min_chunk_size,
**cfg.extra_splitter_kwargs
)
else:

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python3
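"""
Delete the Qdrant collection and create it again so that documents can be
re-indexed afterwards (this script itself does not index any documents).
"""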
import asyncio
import os
import sys
from backend.rag_core import QdrantHybridStore
async def delete_and_recreate(collection_name: str = "rag_documents") -> None:
    """Drop a Qdrant collection and then create it again.

    Args:
        collection_name: Name of the collection to reset. Defaults to
            ``"rag_documents"``, the value this script previously
            hard-coded, so existing zero-argument callers are unaffected.

    Raises:
        Exception: Whatever ``create_collection()`` raises is propagated.
            Errors from ``delete_collection()`` are printed and ignored.
    """
    banner = "=" * 70
    print(banner)
    print("删除旧集合并重新创建...")
    print(banner)

    vs = QdrantHybridStore(collection_name=collection_name)

    # Deleting is best-effort: the collection may not exist yet, and that
    # must not stop the script from creating a new one below.
    # NOTE(review): this coroutine awaits nothing; if the store methods are
    # actually coroutines they need `await` -- confirm against QdrantHybridStore.
    try:
        vs.delete_collection()
        print("✅ 旧集合已删除")
    except Exception as e:
        print(f"⚠️ 删除集合时出错(可能不存在): {e}")

    # Recreate the collection; a failure here is a real error, so it is
    # deliberately not caught.
    vs.create_collection()
    print("✅ 新集合已创建")
if __name__ == "__main__":
    # Script entry point: run the reset coroutine to completion on a new
    # event loop.
    asyncio.run(delete_and_recreate())