检索器重构
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Failing after 17m12s

This commit is contained in:
2026-04-19 22:01:55 +08:00
parent cc8ef41ef9
commit 933d418d77
26 changed files with 1694 additions and 1717 deletions

View File

@@ -1,85 +1,77 @@
"""
Command-line interface for the RAG index builder.
Minimal command-line entry point that builds the RAG index with the default configuration.
"""
import argparse
import asyncio
import logging
import sys
from pathlib import Path
from rag_indexer.builder import IndexBuilder
from rag_indexer.IndexBuilder import IndexBuilder, IndexBuilderConfig
from rag_indexer.splitters import SplitterType
# Configure the root logger at import time: INFO level, with timestamp,
# logger name and level in every record.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Basic configuration
# Default configuration (all connection parameters are read from environment variables)
COLLECTION_NAME = "rag_documents"
# NOTE(review): this URI embeds a username, password and host in source,
# which contradicts the comment above about reading connection parameters
# from the environment — the credential should be rotated and moved out of
# the repository.
DB_URI = "postgresql://postgres:huang1998@115.190.121.151:5432/langgraph_db?sslmode=disable"
# Splitter used for the index, and the splitter applied to child chunks.
SPLITTER_TYPE = SplitterType.PARENT_CHILD
CHILD_SPLITTER_TYPE = SplitterType.SEMANTIC
# Basic splitting parameters
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
# Parent/child chunk splitting parameters
# Parent/child chunk size parameters (adjust as needed)
PARENT_CHUNK_SIZE = 1000
# NOTE(review): CHILD_CHUNK_SIZE is assigned twice with the same value
# (here and two lines below); one of the assignments is redundant.
CHILD_CHUNK_SIZE = 200
PARENT_CHUNK_OVERLAP = 100
CHILD_CHUNK_SIZE = 200
CHILD_CHUNK_OVERLAP = 20
# Passed to the builder configuration as `search_k`.
SEARCH_K = 5
# Splitting strategy: basic, semantic, or parent-child
STRATEGY = "parent-child"
# Storage type: postgres (PostgreSQL) or local (local files)
STORAGE_TYPE = "postgres"
def get_input_path() -> Path:
    """Return the input path to index.

    Returns:
        The first command-line argument (``sys.argv[1]``) wrapped in a
        ``Path`` when one is given; otherwise the built-in sample path
        ``data/user_docs/a.txt``.
    """
    if len(sys.argv) > 1:
        return Path(sys.argv[1])
    # Default test path (edit as needed).
    return Path("data/user_docs/a.txt")
async def main() -> None:
    """Build the RAG index for the input path and print a summary.

    The input path comes from ``get_input_path()``. A directory is indexed
    recursively with ``build_from_directory``; anything else is indexed with
    ``build_from_file``. On success the number of indexed chunks and the
    collection info reported by the builder are printed.

    Exits the process with status 1 when the input path does not exist or
    when index building raises an exception (the exception is logged with
    its traceback first).
    """
    input_path = get_input_path()
    if not input_path.exists():
        logger.error("路径不存在: %s", input_path)
        sys.exit(1)

    # Build the configuration from the module-level defaults.
    config = IndexBuilderConfig(
        collection_name=COLLECTION_NAME,
        splitter_type=SPLITTER_TYPE,
        parent_chunk_size=PARENT_CHUNK_SIZE,
        parent_chunk_overlap=PARENT_CHUNK_OVERLAP,
        child_chunk_size=CHILD_CHUNK_SIZE,
        child_chunk_overlap=CHILD_CHUNK_OVERLAP,
        child_splitter_type=CHILD_SPLITTER_TYPE,
        search_k=SEARCH_K,
        # No docstore argument is passed here; per the original author's
        # note, the default docstore is created via create_docstore using a
        # PostgreSQL connection read from environment variables
        # (TODO confirm against IndexBuilderConfig).
    )
    builder = IndexBuilder(config)
    is_directory = input_path.is_dir()

    try:
        # The builder is used as an async context manager so that whatever
        # it acquires on entry is released on exit, including on failure.
        async with builder:
            if is_directory:
                chunk_count = await builder.build_from_directory(
                    input_path, recursive=True
                )
            else:
                chunk_count = await builder.build_from_file(input_path)

            print(f"\n索引构建完成。共索引 {chunk_count} 个块")
            # Queried while the builder context is still open.
            info = builder.get_collection_info()
            print(
                f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量"
                f"(维度:{info['vector_size']})"
            )
    except Exception as e:
        # Top-level boundary of the CLI: log with traceback, then fail.
        logger.exception("索引构建失败: %s", e)
        sys.exit(1)