Files
ailine/rag_indexer/cli.py

87 lines
2.6 KiB
Python
Raw Normal View History

2026-04-18 16:56:23 +08:00
"""
Command-line interface for the RAG index builder.
"""
import argparse
import asyncio
import logging
import os
import sys

from rag_indexer.builder import IndexBuilder
from rag_indexer.splitters import SplitterType
# Root-logger setup for the whole CLI run: INFO level, timestamped lines
# tagged with the emitting module's logger name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
2026-04-19 15:01:40 +08:00
# Base configuration.
COLLECTION_NAME = "rag_documents"
# SECURITY: a live database credential was hard-coded here. It can now be
# supplied via the RAG_DB_URI environment variable; the original value is
# kept as the fallback so existing deployments are unaffected — rotate the
# password and move it out of source control.
DB_URI = os.environ.get(
    "RAG_DB_URI",
    "postgresql://postgres:huang1998@115.190.121.151:5432/langgraph_db?sslmode=disable",
)

# Basic (recursive) splitting parameters.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
# Parent-child splitting parameters.
PARENT_CHUNK_SIZE = 1000
CHILD_CHUNK_SIZE = 200
PARENT_CHUNK_OVERLAP = 100
CHILD_CHUNK_OVERLAP = 20
# Splitting strategy: "basic", "semantic", or "parent-child".
# NOTE(review): main() currently hard-codes parent-child regardless of this
# value — confirm whether STRATEGY is meant to drive the selection.
STRATEGY = "parent-child"
# Storage type for the parent docstore: "postgres" (PostgreSQL) or
# "local" (local files).
STORAGE_TYPE = "postgres"
async def main():
    """Build the RAG vector index from the configured corpus directory.

    The splitting strategy is currently fixed to parent-child splitting with
    a semantic child splitter. Prints a summary on success; logs the traceback
    and exits with status 1 on any failure.
    """
    # Fixed strategy (see the STRATEGY constant above for the intended knob).
    splitter_type = SplitterType.PARENT_CHILD
    child_splitter_type = SplitterType.SEMANTIC

    splitter_kwargs = {}

    if splitter_type == SplitterType.RECURSIVE:
        # Basic recursive splitting: flat chunks with a small overlap.
        splitter_kwargs["chunk_size"] = CHUNK_SIZE
        splitter_kwargs["chunk_overlap"] = CHUNK_OVERLAP
    elif splitter_type == SplitterType.PARENT_CHILD:
        # Parent-child splitting: large parent chunks go to a docstore,
        # small child chunks are embedded for retrieval.
        splitter_kwargs["parent_chunk_size"] = PARENT_CHUNK_SIZE
        splitter_kwargs["child_chunk_size"] = CHILD_CHUNK_SIZE
        splitter_kwargs["parent_chunk_overlap"] = PARENT_CHUNK_OVERLAP
        splitter_kwargs["child_chunk_overlap"] = CHILD_CHUNK_OVERLAP
        splitter_kwargs["child_splitter_type"] = child_splitter_type
        if STORAGE_TYPE == "postgres":
            splitter_kwargs["docstore_conn_string"] = DB_URI
        elif STORAGE_TYPE == "local":
            splitter_kwargs["docstore_path"] = "./parent_docs"
        else:
            # Unknown storage type: fall back to PostgreSQL rather than fail.
            splitter_kwargs["docstore_conn_string"] = DB_URI

    builder = IndexBuilder(
        collection_name=COLLECTION_NAME,
        splitter_type=splitter_type,
        **splitter_kwargs
    )

    # Source to index: a directory walked recursively. Flip is_file to True
    # to index a single file at `path` instead.
    is_file = False
    path = "data/corpus/"

    try:
        if is_file:
            chunk_count = await builder.build_from_file(path)
        else:
            chunk_count = await builder.build_from_directory(path, recursive=True)

        print(f"索引构建完成。共索引 {chunk_count} 个块")

        info = builder.get_collection_info()
        # Fixed: the closing ")" after the vector dimension was missing.
        print(f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量(维度:{info['vector_size']})")
    except Exception as e:
        # logging.exception records the full traceback at ERROR level.
        logging.exception(f"索引构建失败:{e}")
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())