Files
ailine/rag_indexer/cli.py
root 726236eaff
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Failing after 5m26s
重构代码,实现相对导入
2026-04-21 10:26:37 +08:00

83 lines
2.5 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
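Minimal command-line entry point that builds a RAG index using the default configuration.

Pass the file or directory to index as the first command-line argument; when
it is omitted, a built-in sample path is used. Because this module uses
relative imports, it has to be run as part of its package (for example
``python -m rag_indexer.cli <path>``) rather than as a standalone script.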
"""
import asyncio
import logging
import sys
from pathlib import Path
# Put the project root (two levels above this file) and its "backend"
# sub-directory at the front of the import search path. The backend entry is
# inserted last, so it ends up ahead of the project root.
_PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(_PROJECT_ROOT))
sys.path.insert(0, str(_PROJECT_ROOT / "backend"))
from .index_builder import IndexBuilder, IndexBuilderConfig
from .splitters import SplitterType
# Configure the root logger once at import time: INFO and above, with each
# record showing timestamp, logger name, level and message.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-scoped logger used by the functions in this file.
logger = logging.getLogger(__name__)
# Default configuration. Per the original author's note, all connection
# parameters are read from environment variables rather than set here.
COLLECTION_NAME: str = "rag_documents"
SEARCH_K: int = 5

# Splitting strategy: parent/child chunking, with the child level using the
# semantic splitter.
SPLITTER_TYPE: SplitterType = SplitterType.PARENT_CHILD
CHILD_SPLITTER_TYPE: SplitterType = SplitterType.SEMANTIC

# Parent/child chunk size and overlap (adjust as needed).
PARENT_CHUNK_SIZE: int = 1000
PARENT_CHUNK_OVERLAP: int = 100
CHILD_CHUNK_SIZE: int = 200
CHILD_CHUNK_OVERLAP: int = 20
def get_input_path() -> Path:
    """Return the path to index.

    The first command-line argument is used when one was supplied;
    otherwise the built-in sample path is returned.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        # Default test path (edit as needed).
        return Path("data/user_docs/a.txt")
    return Path(cli_args[0])
async def main() -> None:
    """Build the RAG index for the path selected on the command line.

    Resolves the input path with ``get_input_path()``, creates an
    ``IndexBuilderConfig`` from the module-level default constants, and
    indexes either a whole directory (recursively) or a single file. On
    success the number of indexed chunks and a summary of the collection are
    printed to stdout.

    The process exits with status 1 when the input path does not exist or
    when any exception is raised while the index is being built.
    """
    input_path = get_input_path()
    if not input_path.exists():
        logger.error("路径不存在: %s", input_path)
        sys.exit(1)

    # Build the configuration entirely from the module-level defaults.
    config = IndexBuilderConfig(
        collection_name=COLLECTION_NAME,
        splitter_type=SPLITTER_TYPE,
        parent_chunk_size=PARENT_CHUNK_SIZE,
        parent_chunk_overlap=PARENT_CHUNK_OVERLAP,
        child_chunk_size=CHILD_CHUNK_SIZE,
        child_chunk_overlap=CHILD_CHUNK_OVERLAP,
        child_splitter_type=CHILD_SPLITTER_TYPE,
        search_k=SEARCH_K,
        # docstore is intentionally left unset. Per the original author's
        # note, the default is created by create_docstore from PostgreSQL
        # connection settings in environment variables (not visible here).
    )
    builder = IndexBuilder(config)
    is_directory = input_path.is_dir()

    try:
        async with builder:
            if is_directory:
                chunk_count = await builder.build_from_directory(input_path, recursive=True)
            else:
                chunk_count = await builder.build_from_file(input_path)

            print(f"\n索引构建完成。共索引 {chunk_count} 个块")
            info = builder.get_collection_info()
            # Fix: the original message opened a full-width parenthesis
            # before the dimension but never closed it.
            print(f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量(维度:{info['vector_size']})")
    except Exception as e:
        # Top-level boundary of the CLI: log the traceback and report
        # failure through the exit status.
        logger.exception("索引构建失败: %s", e)
        sys.exit(1)
# Script entry point: drive the async main() coroutine to completion on a
# fresh event loop when this module is executed rather than imported.
if __name__ == "__main__":
    asyncio.run(main())