refactor: 重构RAG核心组件,简化代码结构和测试文件
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Failing after 6m53s
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Failing after 6m53s
This commit is contained in:
@@ -6,10 +6,6 @@ import asyncio
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# 加载 .env 文件
|
||||
load_dotenv()
|
||||
|
||||
from rag_indexer.index_builder import IndexBuilder, IndexBuilderConfig
|
||||
from rag_indexer.splitters import SplitterType
|
||||
@@ -38,7 +34,7 @@ def get_input_path() -> Path:
|
||||
if len(sys.argv) > 1:
|
||||
return Path(sys.argv[1])
|
||||
# 默认测试路径(可按需修改)
|
||||
return Path("data/user_docs/doublestory.txt")
|
||||
return Path("data/corpus/三国演义.txt")
|
||||
|
||||
|
||||
async def main():
|
||||
|
||||
@@ -45,6 +45,11 @@ class IndexBuilderConfig:
|
||||
child_chunk_size: int = 200
|
||||
child_chunk_overlap: int = 20
|
||||
child_splitter_type: SplitterType = SplitterType.SEMANTIC # 子块默认语义切分
|
||||
# 子块语义切分参数
|
||||
child_buffer_size: int = 1
|
||||
child_breakpoint_threshold_type: str = "percentile"
|
||||
child_breakpoint_threshold_amount: float = 90 # 降低阈值,让切分更激进
|
||||
child_min_chunk_size: int = 50 # 降低最小块大小
|
||||
|
||||
# 检索参数
|
||||
search_k: int = 5
|
||||
@@ -86,7 +91,6 @@ class IndexBuilder:
|
||||
# 初始化向量存储(自动支持稠密+稀疏混合检索)
|
||||
self.vector_store = QdrantHybridStore(
|
||||
collection_name=config.collection_name,
|
||||
embeddings=self.embeddings,
|
||||
)
|
||||
logger.info("✅ 混合检索向量存储初始化成功(稠密+BM25稀疏)")
|
||||
|
||||
@@ -125,6 +129,10 @@ class IndexBuilder:
|
||||
self.child_splitter = get_splitter(
|
||||
SplitterType.SEMANTIC,
|
||||
embeddings=self.embeddings,
|
||||
buffer_size=cfg.child_buffer_size,
|
||||
breakpoint_threshold_type=cfg.child_breakpoint_threshold_type,
|
||||
breakpoint_threshold_amount=cfg.child_breakpoint_threshold_amount,
|
||||
min_chunk_size=cfg.child_min_chunk_size,
|
||||
**cfg.extra_splitter_kwargs
|
||||
)
|
||||
else:
|
||||
|
||||
34
rag_indexer/reset_qdrant.py
Normal file
34
rag_indexer/reset_qdrant.py
Normal file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
删除 Qdrant 集合并重新索引
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
from backend.rag_core import QdrantHybridStore
|
||||
|
||||
|
||||
async def delete_and_recreate(collection_name: str = "rag_documents"):
    """Drop a Qdrant collection and create it again.

    Args:
        collection_name: Name of the collection to reset. Defaults to
            ``"rag_documents"``, the value this script previously
            hard-coded, so existing callers behave the same.

    Raises:
        Exception: Whatever ``create_collection`` raises is propagated.
            Errors from ``delete_collection`` are reported on stdout and
            otherwise ignored.

    NOTE(review): this coroutine never awaits anything; the store methods
    are called synchronously. Confirm that ``QdrantHybridStore`` methods
    are not coroutines.
    """
    banner = "=" * 70
    print(banner)
    print("删除旧集合并重新创建...")
    print(banner)

    vs = QdrantHybridStore(collection_name=collection_name)

    # Best-effort delete: a failure here (for example, the collection not
    # existing yet) must not stop the collection from being created below.
    try:
        vs.delete_collection()
    except Exception as e:
        print(f"⚠️ 删除集合时出错(可能不存在): {e}")
    else:
        print("✅ 旧集合已删除")

    # Create the collection again; errors here are allowed to propagate.
    vs.create_collection()
    print("✅ 新集合已创建")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the reset coroutine to completion on a fresh
    # event loop when this file is executed directly.
    asyncio.run(delete_and_recreate())
|
||||
Reference in New Issue
Block a user