# ailine/rag_indexer/cli.py

"""
Command-line interface for the RAG index builder.
"""

import argparse
import asyncio
import logging
import os
import sys

from rag_indexer.builder import IndexBuilder
from rag_indexer.splitters import SplitterType

# Configure the root logger at import time: INFO level, with timestamp,
# logger name and level prefixed to every record.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

# Base configuration
COLLECTION_NAME = "rag_documents"

# SECURITY: the fallback URI below embeds a database password in source code.
# Supply the connection string through the RAG_DB_URI environment variable
# instead; the literal is kept only so existing deployments keep working and
# the credential should be rotated and removed from the repository.
DB_URI = os.environ.get("RAG_DB_URI") or (
    "postgresql://postgres:huang1998@115.190.121.151:5432/langgraph_db?sslmode=disable"
)

# Basic splitting parameters
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Parent-child splitting parameters
PARENT_CHUNK_SIZE = 1000
CHILD_CHUNK_SIZE = 200
PARENT_CHUNK_OVERLAP = 100
CHILD_CHUNK_OVERLAP = 20

# Splitting strategy: "basic", "semantic" or "parent-child"
STRATEGY = "parent-child"

# Storage type: "postgres" (PostgreSQL) or "local" (local files)
STORAGE_TYPE = "postgres"


def _build_splitter_kwargs(splitter_type, child_splitter_type):
    """Return the splitter-specific keyword arguments for ``IndexBuilder``.

    Args:
        splitter_type: The ``SplitterType`` member selecting the strategy.
        child_splitter_type: The ``SplitterType`` member forwarded as
            ``child_splitter_type`` when the parent-child strategy is used.

    Returns:
        A dict of keyword arguments; empty when *splitter_type* is neither
        ``RECURSIVE`` nor ``PARENT_CHILD``.
    """
    if splitter_type == SplitterType.RECURSIVE:
        return {"chunk_size": CHUNK_SIZE, "chunk_overlap": CHUNK_OVERLAP}
    if splitter_type != SplitterType.PARENT_CHILD:
        return {}

    kwargs = {
        "parent_chunk_size": PARENT_CHUNK_SIZE,
        "child_chunk_size": CHILD_CHUNK_SIZE,
        "parent_chunk_overlap": PARENT_CHUNK_OVERLAP,
        "child_chunk_overlap": CHILD_CHUNK_OVERLAP,
        "child_splitter_type": child_splitter_type,
    }
    if STORAGE_TYPE == "local":
        kwargs["docstore_path"] = "./parent_docs"
    else:
        # Any value other than "local" uses the PostgreSQL docstore. An
        # unrecognised value is most likely a typo, so report it instead of
        # falling back silently.
        if STORAGE_TYPE != "postgres":
            logging.warning(
                "Unknown STORAGE_TYPE %r; falling back to 'postgres'", STORAGE_TYPE
            )
        kwargs["docstore_conn_string"] = DB_URI
    return kwargs


async def main():
    """Build the index from ``data/corpus/`` and print a summary.

    Creates an ``IndexBuilder`` for ``COLLECTION_NAME``, indexes the corpus
    directory recursively, then prints the chunk count and the collection
    info returned by the builder. Any exception raised while building or
    reporting is logged with its traceback and the function calls
    ``sys.exit(1)``.
    """
    # The strategy is fixed here; the module-level STRATEGY constant is not
    # consulted by this function.
    splitter_type = SplitterType.PARENT_CHILD
    child_splitter_type = SplitterType.SEMANTIC

    builder = IndexBuilder(
        collection_name=COLLECTION_NAME,
        splitter_type=splitter_type,
        **_build_splitter_kwargs(splitter_type, child_splitter_type),
    )

    # Hard-coded input selection: a directory is indexed unless is_file is set.
    is_file = False
    path = "data/corpus/"

    try:
        if is_file:
            chunk_count = await builder.build_from_file(path)
        else:
            chunk_count = await builder.build_from_directory(path, recursive=True)

        print(f"索引构建完成。共索引 {chunk_count} 个块")
        info = builder.get_collection_info()
        print(f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量（维度：{info['vector_size']}）")
    except Exception as e:
        # Top-level boundary: logging.exception records the traceback, then
        # the process exits with a non-zero status.
        logging.exception("索引构建失败：%s", e)
        sys.exit(1)


# Script entry point: when executed directly (not imported), run the main()
# coroutine to completion via asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())