Files
ailine/rag_indexer/cli.py
root 4209386c77
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Failing after 6m22s
refactor: 统一导入方式,移除 sys.path 操作
- 重构所有模块导入,移除 sys.path.insert
- 统一使用 from backend.xxx 的绝对导入方式
- rag_core 包内使用相对导入(from .xxx)
- 移动 visualize_graph.py 到 tools/ 目录
- 添加必要的 __init__.py 文件
- 清理废弃文档和脚本
2026-05-04 12:55:45 +08:00

83 lines
2.4 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
简易命令行入口,使用默认配置构建 RAG 索引。
"""
import asyncio
import logging
import sys
from pathlib import Path
from dotenv import load_dotenv
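# Load variables from the .env file before the project imports below run (presumably they read environment settings at import time — confirm before reordering).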
load_dotenv()
from rag_indexer.index_builder import IndexBuilder, IndexBuilderConfig
from rag_indexer.splitters import SplitterType
# Configure the root logger: INFO and above, with timestamp, logger name,
# level and message in every record.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the CLI functions below.
logger = logging.getLogger(__name__)
# Default settings. Per the original note, all connection parameters are
# read from environment variables rather than configured here.
COLLECTION_NAME: str = "rag_documents"
SEARCH_K: int = 5

# Splitter selection: the top-level strategy and the splitter used for
# child chunks.
SPLITTER_TYPE = SplitterType.PARENT_CHILD
CHILD_SPLITTER_TYPE = SplitterType.SEMANTIC

# Parent/child chunk sizing (adjust as needed).
PARENT_CHUNK_SIZE: int = 1000
PARENT_CHUNK_OVERLAP: int = 100
CHILD_CHUNK_SIZE: int = 200
CHILD_CHUNK_OVERLAP: int = 20
def get_input_path() -> Path:
    """Return the path to index.

    The first command-line argument is used when one is supplied;
    otherwise the default example path is returned.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        # Default test path (edit as needed).
        return Path("data/user_docs/doublestory.txt")
    return Path(cli_args[0])
async def main() -> None:
    """Build the RAG index for the input path and print a short summary.

    The input path comes from ``get_input_path()``. A directory is indexed
    recursively with ``build_from_directory``; any other path is indexed
    with ``build_from_file``.

    Exits the process with status 1 when the input path does not exist or
    when any exception is raised while building the index.
    """
    input_path = get_input_path()
    if not input_path.exists():
        logger.error("路径不存在: %s", input_path)
        sys.exit(1)

    # Build the configuration from the module-level defaults.
    config = IndexBuilderConfig(
        collection_name=COLLECTION_NAME,
        splitter_type=SPLITTER_TYPE,
        parent_chunk_size=PARENT_CHUNK_SIZE,
        parent_chunk_overlap=PARENT_CHUNK_OVERLAP,
        child_chunk_size=CHILD_CHUNK_SIZE,
        child_chunk_overlap=CHILD_CHUNK_OVERLAP,
        child_splitter_type=CHILD_SPLITTER_TYPE,
        search_k=SEARCH_K,
        # docstore is left at its default; per the original note it is
        # created by create_docstore, which reads the PostgreSQL connection
        # from environment variables.
    )
    builder = IndexBuilder(config)
    is_directory = input_path.is_dir()

    try:
        async with builder:
            if is_directory:
                chunk_count = await builder.build_from_directory(input_path, recursive=True)
            else:
                chunk_count = await builder.build_from_file(input_path)
            print(f"\n索引构建完成。共索引 {chunk_count} 个块")
            # NOTE(review): the collection summary is queried while the
            # builder context is still open — confirm this matches the
            # intended nesting in the original file.
            info = builder.get_collection_info()
            # The closing fullwidth parenthesis was missing from this message.
            print(f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量(维度:{info['vector_size']})")
    except Exception as e:
        # Top-level boundary: log the full traceback and signal failure
        # through the exit status.
        logger.exception("索引构建失败: %s", e)
        sys.exit(1)
# Script entry point: when executed directly (not imported), run the async
# main() coroutine to completion with asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())