2026-04-18 16:56:23 +08:00
|
|
|
|
"""
|
|
|
|
|
|
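Script entry point for the RAG index builder.

Settings are taken from the module-level constants defined in this file;
no command-line arguments are parsed.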
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
import asyncio
import logging
import os
import sys

from rag_indexer.builder import IndexBuilder
from rag_indexer.splitters import SplitterType
|
2026-04-18 16:56:23 +08:00
|
|
|
|
|
|
|
|
|
|
# Configure the root logger once at import time: INFO level, with a
# timestamp / logger name / level prefix on every record.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
|
|
|
|
|
|
|
2026-04-19 15:01:40 +08:00
|
|
|
|
# Base configuration.
COLLECTION_NAME = "rag_documents"

# Connection string for the PostgreSQL database.
# The value is read from the RAG_DB_URI environment variable when it is set
# to a non-empty string; otherwise the historical default below is used so
# existing deployments keep working unchanged.
# SECURITY: the fallback embeds a plaintext password in source control.
# Rotate that credential and supply the URI through RAG_DB_URI instead.
DB_URI = os.environ.get("RAG_DB_URI") or (
    "postgresql://postgres:huang1998@115.190.121.151:5432/langgraph_db?sslmode=disable"
)

# Parameters for basic chunking.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Parameters for parent/child chunking.
PARENT_CHUNK_SIZE = 1000
CHILD_CHUNK_SIZE = 200
PARENT_CHUNK_OVERLAP = 100
CHILD_CHUNK_OVERLAP = 20

# Splitting strategy: "basic", "semantic", or "parent-child".
# NOTE(review): main() in this file pins SplitterType.PARENT_CHILD directly
# and does not read this value - confirm whether it is still needed.
STRATEGY = "parent-child"

# Storage type: "postgres" (PostgreSQL) or "local" (local files).
STORAGE_TYPE = "postgres"
|
2026-04-18 16:56:23 +08:00
|
|
|
|
|
2026-04-19 15:01:40 +08:00
|
|
|
|
|
|
|
|
|
|
async def main():
    """Build the index for the configured corpus and print a short summary.

    Assembles the splitter keyword arguments from the module-level
    constants, creates an ``IndexBuilder`` for ``COLLECTION_NAME``, indexes
    the ``data/corpus/`` directory, and prints the chunk count followed by
    the collection details returned by ``get_collection_info()``.

    Any ``Exception`` raised while building or reporting is logged with its
    traceback and the process exits with status 1.
    """
    # The strategy is fixed here: parent/child chunking, with the semantic
    # splitter used for the child chunks.
    strategy = SplitterType.PARENT_CHILD
    child_strategy = SplitterType.SEMANTIC

    if strategy == SplitterType.RECURSIVE:
        splitter_options = {
            "chunk_size": CHUNK_SIZE,
            "chunk_overlap": CHUNK_OVERLAP,
        }
    elif strategy == SplitterType.PARENT_CHILD:
        splitter_options = {
            "parent_chunk_size": PARENT_CHUNK_SIZE,
            "child_chunk_size": CHILD_CHUNK_SIZE,
            "parent_chunk_overlap": PARENT_CHUNK_OVERLAP,
            "child_chunk_overlap": CHILD_CHUNK_OVERLAP,
            "child_splitter_type": child_strategy,
        }
        # Only "local" selects the on-disk docstore; "postgres" and any
        # other value both use the database connection string.
        if STORAGE_TYPE == "local":
            splitter_options["docstore_path"] = "./parent_docs"
        else:
            splitter_options["docstore_conn_string"] = DB_URI
    else:
        splitter_options = {}

    builder = IndexBuilder(
        collection_name=COLLECTION_NAME,
        splitter_type=strategy,
        **splitter_options,
    )

    # Input selection is fixed: index a directory rather than a single file.
    source_is_file = False
    source_path = "data/corpus/"

    try:
        if source_is_file:
            total_chunks = await builder.build_from_file(source_path)
        else:
            total_chunks = await builder.build_from_directory(
                source_path, recursive=True
            )

        print(f"索引构建完成。共索引 {total_chunks} 个块")

        info = builder.get_collection_info()
        print(f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量(维度:{info['vector_size']})")

    except Exception as e:
        logging.exception(f"索引构建失败:{e}")
        sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the async entry point only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|