RAG数据库生成

This commit is contained in:
2026-04-19 15:01:40 +08:00
parent c18e8a9860
commit cc8ef41ef9
17 changed files with 1089 additions and 577 deletions

View File

@@ -3,100 +3,85 @@ Command-line interface for the RAG index builder.
"""
import argparse
import asyncio
import logging
import os
import sys

from builder import IndexBuilder
from splitters import SplitterType
from rag_indexer.builder import IndexBuilder
from rag_indexer.splitters import SplitterType
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

# Base configuration.
# Collection name handed to IndexBuilder as ``collection_name``.
COLLECTION_NAME = "rag_documents"

# PostgreSQL connection string, used as ``docstore_conn_string`` when the
# parent-document store is backed by PostgreSQL.
# SECURITY NOTE(review): the fallback literal embeds a real host and password
# in source control. Set RAG_DB_URI in the environment to override it, rotate
# the exposed credential, and remove the literal once every deployment
# supplies the variable.
DB_URI = os.environ.get(
    "RAG_DB_URI",
    "postgresql://postgres:huang1998@115.190.121.151:5432/langgraph_db?sslmode=disable",
)
# Splitting parameters used when the recursive splitter is selected.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Splitting parameters used when the parent-child splitter is selected.
PARENT_CHUNK_SIZE = 1000
CHILD_CHUNK_SIZE = 200
PARENT_CHUNK_OVERLAP = 100
CHILD_CHUNK_OVERLAP = 20

# Splitting strategy: "basic", "semantic", or "parent-child".
# NOTE(review): main() currently pins SplitterType.PARENT_CHILD and does not
# read this value; confirm whether it is meant to drive the selection.
STRATEGY = "parent-child"

# Parent-document storage backend: "postgres" (PostgreSQL) or "local"
# (local files).
STORAGE_TYPE = "postgres"
async def main(path: str = "data/corpus/", *, is_file: bool = False) -> None:
    """Build the RAG index from ``path`` and print a short summary.

    The splitter strategy is fixed to parent-child splitting with a semantic
    child splitter; sizes, overlaps and the storage backend come from the
    module-level configuration constants.

    Args:
        path: File or directory to index. Defaults to ``"data/corpus/"``.
        is_file: When true, ``path`` is indexed with ``build_from_file``;
            otherwise it is indexed with ``build_from_directory`` using
            ``recursive=True``.

    Raises:
        SystemExit: With exit status 1 when indexing or the collection-info
            lookup raises any exception (the traceback is logged first).
    """
    # Use the fixed strategy.
    splitter_type = SplitterType.PARENT_CHILD
    child_splitter_type = SplitterType.SEMANTIC

    splitter_kwargs = {}
    if splitter_type == SplitterType.RECURSIVE:
        splitter_kwargs["chunk_size"] = CHUNK_SIZE
        splitter_kwargs["chunk_overlap"] = CHUNK_OVERLAP
    elif splitter_type == SplitterType.PARENT_CHILD:
        splitter_kwargs["parent_chunk_size"] = PARENT_CHUNK_SIZE
        splitter_kwargs["child_chunk_size"] = CHILD_CHUNK_SIZE
        splitter_kwargs["parent_chunk_overlap"] = PARENT_CHUNK_OVERLAP
        splitter_kwargs["child_chunk_overlap"] = CHILD_CHUNK_OVERLAP
        splitter_kwargs["child_splitter_type"] = child_splitter_type
        if STORAGE_TYPE == "local":
            splitter_kwargs["docstore_path"] = "./parent_docs"
        else:
            # "postgres" and any unrecognised value both use the PostgreSQL
            # connection string, exactly as the original three-way branch did.
            splitter_kwargs["docstore_conn_string"] = DB_URI

    builder = IndexBuilder(
        collection_name=COLLECTION_NAME,
        splitter_type=splitter_type,
        **splitter_kwargs,
    )

    try:
        if is_file:
            chunk_count = await builder.build_from_file(path)
        else:
            chunk_count = await builder.build_from_directory(path, recursive=True)
        print(f"索引构建完成。共索引 {chunk_count} 个块")

        info = builder.get_collection_info()
        # The closing parenthesis was missing from this message.
        print(
            f"集合 '{info['name']}' 包含 {info['vectors_count']} 个向量"
            f"(维度:{info['vector_size']})"
        )
    except Exception as e:
        # Top-level boundary: log the traceback and exit with a failure code.
        logging.exception("索引构建失败:%s", e)
        sys.exit(1)
if __name__ == "__main__":
    # main() is a coroutine function: calling it bare only creates a coroutine
    # object that is never awaited, so it has to be driven by asyncio.run().
    asyncio.run(main())