Files
ailine/rag_indexer/cli.py

94 lines
2.9 KiB
Python
Raw Normal View History

2026-04-18 16:56:23 +08:00
"""
2026-04-19 22:01:55 +08:00
Simple command-line entry point that builds the RAG index using the default configuration.
2026-04-18 16:56:23 +08:00
"""
2026-04-19 15:01:40 +08:00
import asyncio
2026-04-18 16:56:23 +08:00
import logging
import sys
2026-04-19 22:01:55 +08:00
from pathlib import Path
2026-04-21 18:41:14 +08:00
from dotenv import load_dotenv
# Load environment variables from the .env file
load_dotenv()
2026-04-18 16:56:23 +08:00
2026-04-21 10:26:37 +08:00
# Add the project root and its backend directory to the Python import path
sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
2026-04-21 18:41:14 +08:00
# Import strategy: conditional, so the module works both when imported as part
# of the package and when executed directly as a script.
if __name__ != "__main__":
    # Imported as a package member: use relative imports.
    from .index_builder import IndexBuilder, IndexBuilderConfig
    from .splitters import SplitterType
else:
    # Run directly as a script: use absolute imports.
    from rag_indexer.index_builder import IndexBuilder, IndexBuilderConfig
    from rag_indexer.splitters import SplitterType
2026-04-18 16:56:23 +08:00
# Configure logging for the CLI (INFO level, timestamped records) and create
# the module-level logger used below.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
2026-04-18 16:56:23 +08:00
2026-04-19 22:01:55 +08:00
# Default settings (all connection parameters are read from environment variables).
COLLECTION_NAME = "rag_documents"

# Splitter selection forwarded to IndexBuilderConfig.
SPLITTER_TYPE = SplitterType.PARENT_CHILD
CHILD_SPLITTER_TYPE = SplitterType.SEMANTIC
2026-04-18 16:56:23 +08:00
2026-04-19 22:01:55 +08:00
# Parent/child chunk sizing (adjust as needed).
PARENT_CHUNK_SIZE, PARENT_CHUNK_OVERLAP = 1000, 100
CHILD_CHUNK_SIZE, CHILD_CHUNK_OVERLAP = 200, 20

# Forwarded to IndexBuilderConfig as ``search_k``.
SEARCH_K = 5
2026-04-19 15:01:40 +08:00
2026-04-19 22:01:55 +08:00
def get_input_path() -> Path:
    """Return the path to index.

    The first command-line argument is used when one is supplied; otherwise a
    built-in sample path is returned.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        return Path(cli_args[0])
    # Fallback sample path for quick local runs (edit as needed).
    return Path("data/user_docs/a.txt")
2026-04-18 16:56:23 +08:00
2026-04-19 15:01:40 +08:00
async def main() -> None:
    """Build the RAG index for the path selected on the command line.

    Resolves the input path with ``get_input_path``, assembles an
    ``IndexBuilderConfig`` from the module-level defaults, indexes either a
    single file or a directory (recursively), and prints a short summary.

    Exits the process with status 1 when the input path does not exist or when
    index building raises an exception.
    """
    input_path = get_input_path()
    if not input_path.exists():
        logger.error("路径不存在: %s", input_path)
        sys.exit(1)

    # Build the configuration entirely from the module-level defaults.
    config = IndexBuilderConfig(
        collection_name=COLLECTION_NAME,
        splitter_type=SPLITTER_TYPE,
        parent_chunk_size=PARENT_CHUNK_SIZE,
        parent_chunk_overlap=PARENT_CHUNK_OVERLAP,
        child_chunk_size=CHILD_CHUNK_SIZE,
        child_chunk_overlap=CHILD_CHUNK_OVERLAP,
        child_splitter_type=CHILD_SPLITTER_TYPE,
        search_k=SEARCH_K,
        # No docstore is passed; per the original note, the default comes from
        # create_docstore, which reads the PostgreSQL connection from
        # environment variables.
    )
    builder = IndexBuilder(config)
    is_directory = input_path.is_dir()

    try:
        async with builder:
            if is_directory:
                chunk_count = await builder.build_from_directory(input_path, recursive=True)
            else:
                chunk_count = await builder.build_from_file(input_path)

            print(f"\n索引构建完成。共索引 {chunk_count} 个块")
            # Query the collection while the builder context is still open.
            info = builder.get_collection_info()
            # The closing parenthesis was missing from the original message.
            print(
                f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量"
                f"(维度:{info['vector_size']})"
            )
    except Exception as e:
        # Top-level boundary: log with traceback and signal failure to the shell.
        logger.exception("索引构建失败: %s", e)
        sys.exit(1)
# Script entry point: run the async main() to completion.
if __name__ == "__main__":
    asyncio.run(main())