""" 简易命令行入口,使用默认配置构建 RAG 索引。 """ import asyncio import logging import sys from pathlib import Path from rag_indexer.index_builder import IndexBuilder, IndexBuilderConfig from rag_indexer.splitters import SplitterType logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) logger = logging.getLogger(__name__) # 默认配置(所有连接参数从环境变量读取) COLLECTION_NAME = "rag_documents" SPLITTER_TYPE = SplitterType.PARENT_CHILD CHILD_SPLITTER_TYPE = SplitterType.SEMANTIC # 父子块大小参数(可根据需要调整) PARENT_CHUNK_SIZE = 1000 PARENT_CHUNK_OVERLAP = 100 CHILD_CHUNK_SIZE = 200 CHILD_CHUNK_OVERLAP = 20 SEARCH_K = 5 def get_input_path() -> Path: """从命令行参数获取输入路径,若未提供则使用默认示例路径。""" if len(sys.argv) > 1: return Path(sys.argv[1]) # 默认测试路径(可按需修改) return Path("data/user_docs/a.txt") async def main(): input_path = get_input_path() if not input_path.exists(): logger.error("路径不存在: %s", input_path) sys.exit(1) # 构建配置(使用全部默认值) config = IndexBuilderConfig( collection_name=COLLECTION_NAME, splitter_type=SPLITTER_TYPE, parent_chunk_size=PARENT_CHUNK_SIZE, parent_chunk_overlap=PARENT_CHUNK_OVERLAP, child_chunk_size=CHILD_CHUNK_SIZE, child_chunk_overlap=CHILD_CHUNK_OVERLAP, child_splitter_type=CHILD_SPLITTER_TYPE, search_k=SEARCH_K, # docstore 默认使用 create_docstore 从环境变量读取 PostgreSQL 连接 ) builder = IndexBuilder(config) is_directory = input_path.is_dir() try: async with builder: if is_directory: chunk_count = await builder.build_from_directory(input_path, recursive=True) else: chunk_count = await builder.build_from_file(input_path) print(f"\n索引构建完成。共索引 {chunk_count} 个块") info = builder.get_collection_info() print(f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量(维度:{info['vector_size']})") except Exception as e: logger.exception("索引构建失败: %s", e) sys.exit(1) if __name__ == "__main__": asyncio.run(main())