Files
ailine/rag_indexer/cli.py
2026-04-19 15:01:40 +08:00

87 lines
2.6 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Command-line interface for the RAG index builder.
"""
import argparse
import asyncio
import logging
import os
import sys

from rag_indexer.builder import IndexBuilder
from rag_indexer.splitters import SplitterType
# Configure the root logger at import time: INFO level, with timestamp,
# logger name and severity in front of every message.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Basic configuration.
COLLECTION_NAME = "rag_documents"
# SECURITY NOTE(review): the default below embeds a database password in
# source control. It is kept only so existing deployments keep working; set
# the RAG_DB_URI environment variable to override it, and rotate/remove the
# hard-coded credential as soon as possible.
DB_URI = os.environ.get(
    "RAG_DB_URI",
    "postgresql://postgres:huang1998@115.190.121.151:5432/langgraph_db?sslmode=disable",
)

# Parameters for basic splitting.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Parameters for parent-child splitting.
PARENT_CHUNK_SIZE = 1000
CHILD_CHUNK_SIZE = 200
PARENT_CHUNK_OVERLAP = 100
CHILD_CHUNK_OVERLAP = 20

# Splitting strategy: "basic", "semantic", or "parent-child".
# NOTE: main() does not read this value; it hard-codes the splitter type.
STRATEGY = "parent-child"

# Storage backend: "postgres" (PostgreSQL) or "local" (local files).
STORAGE_TYPE = "postgres"
async def main():
    """Build the RAG index from ``data/corpus/`` using the module settings.

    The splitter type is fixed to parent-child with a semantic child
    splitter. On success the number of indexed chunks and the collection
    statistics are printed. On any failure the exception is logged with its
    traceback and the process exits with status 1.
    """
    # Fixed strategy: parent-child splitting with semantic child chunks.
    splitter_type = SplitterType.PARENT_CHILD
    child_splitter_type = SplitterType.SEMANTIC

    splitter_kwargs = {}
    if splitter_type == SplitterType.RECURSIVE:
        splitter_kwargs["chunk_size"] = CHUNK_SIZE
        splitter_kwargs["chunk_overlap"] = CHUNK_OVERLAP
    elif splitter_type == SplitterType.PARENT_CHILD:
        splitter_kwargs["parent_chunk_size"] = PARENT_CHUNK_SIZE
        splitter_kwargs["child_chunk_size"] = CHILD_CHUNK_SIZE
        splitter_kwargs["parent_chunk_overlap"] = PARENT_CHUNK_OVERLAP
        splitter_kwargs["child_chunk_overlap"] = CHILD_CHUNK_OVERLAP
        splitter_kwargs["child_splitter_type"] = child_splitter_type
        if STORAGE_TYPE == "local":
            splitter_kwargs["docstore_path"] = "./parent_docs"
        else:
            # "postgres" and any unrecognised storage type both store the
            # parent documents in the database.
            splitter_kwargs["docstore_conn_string"] = DB_URI

    builder = IndexBuilder(
        collection_name=COLLECTION_NAME,
        splitter_type=splitter_type,
        **splitter_kwargs,
    )

    # Hard-coded input: index a whole directory rather than a single file.
    is_file = False
    path = "data/corpus/"
    try:
        if is_file:
            chunk_count = await builder.build_from_file(path)
        else:
            chunk_count = await builder.build_from_directory(path, recursive=True)
        print(f"索引构建完成。共索引 {chunk_count} 个块")
        info = builder.get_collection_info()
        # The closing full-width parenthesis was missing from this message.
        print(
            f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量"
            f"(维度:{info['vector_size']})"
        )
    except Exception as e:
        # Top-level boundary: log the traceback and signal failure to the shell.
        logging.exception("索引构建失败:%s", e)
        sys.exit(1)
# Script entry point: run the async index build on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())