2026-04-18 16:56:23 +08:00
|
|
|
|
"""
|
2026-04-19 22:01:55 +08:00
|
|
|
|
简易命令行入口,使用默认配置构建 RAG 索引。
|
2026-04-18 16:56:23 +08:00
|
|
|
|
"""
|
|
|
|
|
|
|
2026-04-19 15:01:40 +08:00
|
|
|
|
import asyncio
|
2026-04-18 16:56:23 +08:00
|
|
|
|
import logging
|
|
|
|
|
|
import sys
|
2026-04-19 22:01:55 +08:00
|
|
|
|
from pathlib import Path
|
2026-04-18 16:56:23 +08:00
|
|
|
|
|
2026-04-21 10:26:37 +08:00
|
|
|
|
# Put the project root and its "backend" directory at the front of the import
# path. Both are derived from this file's location: two levels up, and
# "backend" beneath that.
sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
|
|
|
|
|
|
|
|
|
|
|
|
from .index_builder import IndexBuilder, IndexBuilderConfig
|
|
|
|
|
|
from .splitters import SplitterType
|
2026-04-18 16:56:23 +08:00
|
|
|
|
|
|
|
|
|
|
# Configure root logging once for this command-line entry point: INFO level,
# with timestamp, logger name and level in front of every message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
2026-04-18 16:56:23 +08:00
|
|
|
|
|
2026-04-19 22:01:55 +08:00
|
|
|
|
# Default configuration (all connection parameters are read from environment
# variables).
COLLECTION_NAME = "rag_documents"
SPLITTER_TYPE = SplitterType.PARENT_CHILD
CHILD_SPLITTER_TYPE = SplitterType.SEMANTIC
|
2026-04-18 16:56:23 +08:00
|
|
|
|
|
2026-04-19 22:01:55 +08:00
|
|
|
|
# Parent/child chunk size parameters (adjust as needed).
PARENT_CHUNK_SIZE = 1000
PARENT_CHUNK_OVERLAP = 100
CHILD_CHUNK_SIZE = 200
CHILD_CHUNK_OVERLAP = 20
SEARCH_K = 5
|
2026-04-19 15:01:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
2026-04-19 22:01:55 +08:00
|
|
|
|
def get_input_path() -> Path:
    """Return the path to index, taken from the command line when given.

    The first positional argument (``sys.argv[1]``) is used as the input
    path. When it is missing or empty, the built-in sample path
    ``data/user_docs/a.txt`` is returned instead.

    Returns:
        The input path as a ``Path``; its existence is not checked here.
    """
    # An empty argument (e.g. an unset shell variable expanded to "") would
    # become Path("."), i.e. the current directory, so treat it as "not given".
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0]:
        return Path(cli_args[0])

    # Default sample path (adjust as needed).
    return Path("data/user_docs/a.txt")
|
2026-04-18 16:56:23 +08:00
|
|
|
|
|
2026-04-19 15:01:40 +08:00
|
|
|
|
|
|
|
|
|
|
async def main() -> None:
    """Build the RAG index for the path selected on the command line.

    Resolves the input path via ``get_input_path()``, creates an
    ``IndexBuilderConfig`` from the module-level defaults, and indexes either
    a single file or a whole directory (recursively). On success, the number
    of indexed chunks and the collection info are printed.

    Exits the process with status 1 when the input path does not exist or
    when index construction raises an exception.
    """
    input_path = get_input_path()
    if not input_path.exists():
        logger.error("路径不存在: %s", input_path)
        sys.exit(1)

    # Build the configuration from the module-level defaults.
    config = IndexBuilderConfig(
        collection_name=COLLECTION_NAME,
        splitter_type=SPLITTER_TYPE,
        parent_chunk_size=PARENT_CHUNK_SIZE,
        parent_chunk_overlap=PARENT_CHUNK_OVERLAP,
        child_chunk_size=CHILD_CHUNK_SIZE,
        child_chunk_overlap=CHILD_CHUNK_OVERLAP,
        child_splitter_type=CHILD_SPLITTER_TYPE,
        search_k=SEARCH_K,
        # docstore: by default create_docstore is used, reading the PostgreSQL
        # connection from environment variables.
    )

    builder = IndexBuilder(config)
    is_directory = input_path.is_dir()

    try:
        async with builder:
            if is_directory:
                chunk_count = await builder.build_from_directory(input_path, recursive=True)
            else:
                chunk_count = await builder.build_from_file(input_path)

            # Report the result while the builder context is still open.
            print(f"\n索引构建完成。共索引 {chunk_count} 个块")
            info = builder.get_collection_info()
            print(f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量(维度:{info['vector_size']})")

    except Exception as e:
        # Top-level boundary of the CLI: log with traceback and exit non-zero.
        logger.exception("索引构建失败: %s", e)
        sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the async main() on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|