""" Command-line interface for the RAG index builder. """ import argparse import asyncio import logging import sys from rag_indexer.builder import IndexBuilder from rag_indexer.splitters import SplitterType logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) # 基础配置 COLLECTION_NAME = "rag_documents" DB_URI = "postgresql://postgres:huang1998@115.190.121.151:5432/langgraph_db?sslmode=disable" # 基础切分参数 CHUNK_SIZE = 500 CHUNK_OVERLAP = 50 # 父子块切分参数 PARENT_CHUNK_SIZE = 1000 CHILD_CHUNK_SIZE = 200 PARENT_CHUNK_OVERLAP = 100 CHILD_CHUNK_OVERLAP = 20 # 切分策略:basic(基础)、semantic(语义)、parent-child(父子块) STRATEGY = "parent-child" # 存储类型:postgres(PostgreSQL)、local(本地文件) STORAGE_TYPE = "postgres" async def main(): # 使用固定策略 splitter_type = SplitterType.PARENT_CHILD child_splitter_type = SplitterType.SEMANTIC splitter_kwargs = {} if splitter_type == SplitterType.RECURSIVE: splitter_kwargs["chunk_size"] = CHUNK_SIZE splitter_kwargs["chunk_overlap"] = CHUNK_OVERLAP elif splitter_type == SplitterType.PARENT_CHILD: splitter_kwargs["parent_chunk_size"] = PARENT_CHUNK_SIZE splitter_kwargs["child_chunk_size"] = CHILD_CHUNK_SIZE splitter_kwargs["parent_chunk_overlap"] = PARENT_CHUNK_OVERLAP splitter_kwargs["child_chunk_overlap"] = CHILD_CHUNK_OVERLAP splitter_kwargs["child_splitter_type"] = child_splitter_type if STORAGE_TYPE == "postgres": splitter_kwargs["docstore_conn_string"] = DB_URI elif STORAGE_TYPE == "local": splitter_kwargs["docstore_path"] = "./parent_docs" else: splitter_kwargs["docstore_conn_string"] = DB_URI builder = IndexBuilder( collection_name=COLLECTION_NAME, splitter_type=splitter_type, **splitter_kwargs ) is_file=False path="data/corpus/" try: if is_file: chunk_count = await builder.build_from_file(path) else: chunk_count = await builder.build_from_directory(path, recursive=True) print(f"索引构建完成。共索引 {chunk_count} 个块") info = builder.get_collection_info() print(f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量(维度:{info['vector_size']})") except Exception as e: logging.exception(f"索引构建失败:{e}") sys.exit(1) if __name__ == "__main__": asyncio.run(main())