This commit is contained in:
102
rag_indexer/cli.py
Executable file
102
rag_indexer/cli.py
Executable file
@@ -0,0 +1,102 @@
|
||||
"""
|
||||
Command-line interface for the RAG index builder.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
|
||||
from builder import IndexBuilder
|
||||
from splitters import SplitterType
|
||||
|
||||
# Configure the root logger once, at import time: emit INFO and above, and
# prefix every record with its timestamp, logger name and severity.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser describing every CLI option."""
    parser = argparse.ArgumentParser(description="Offline RAG Index Builder")

    # Input selection: at least one of these must be given (checked in main).
    parser.add_argument("--file", type=str, help="Path to file to index")
    parser.add_argument("--dir", type=str, help="Path to directory to index")
    parser.add_argument("--recursive", action="store_true", default=True,
                        help="Recursively process directories (default: True)")
    # --recursive defaults to True, so on its own it could never be disabled;
    # this companion flag writes False into the same destination.
    parser.add_argument("--no-recursive", dest="recursive",
                        action="store_false",
                        help="Do not descend into sub-directories of --dir")

    # Vector store.
    parser.add_argument("--collection", type=str, default="rag_documents",
                        help="Qdrant collection name (default: rag_documents)")
    # NOTE(review): no default is set here, so None is forwarded to
    # IndexBuilder; presumably it applies the URL quoted in the help text.
    parser.add_argument("--qdrant-url", type=str,
                        help="Qdrant server URL (default: http://127.0.0.1:6333)")

    # Splitting strategy and its sizes.
    parser.add_argument("--splitter", type=str,
                        choices=["recursive", "semantic", "parent_child"],
                        default="recursive",
                        help="Text splitting strategy (default: recursive)")
    parser.add_argument("--chunk-size", type=int, default=500,
                        help="Chunk size for recursive/parent splitter (default: 500)")
    parser.add_argument("--chunk-overlap", type=int, default=50,
                        help="Chunk overlap (default: 50)")
    parser.add_argument("--parent-size", type=int, default=1000,
                        help="Parent chunk size for parent-child splitter (default: 1000)")
    parser.add_argument("--child-size", type=int, default=200,
                        help="Child chunk size for parent-child splitter (default: 200)")

    # Parent-document storage (only forwarded for the parent_child splitter).
    parser.add_argument("--docstore-path", type=str,
                        default=None,
                        help="Path to store parent documents for parent-child "
                             "splitter (default: ./parent_docs or "
                             "HERMES_HOME/parent_docs)")
    parser.add_argument("--docstore-type", type=str,
                        choices=["local", "postgres"],
                        default="local",
                        help="Type of docstore: 'local' (default) or 'postgres' "
                             "for PostgreSQL-backed storage")
    parser.add_argument("--docstore-conn", type=str,
                        default=None,
                        help="PostgreSQL connection string for postgres docstore")
    return parser


def main(argv=None):
    """Parse command-line options and build the index.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the behaviour of the
            script entry point.

    Exits with status 1 when neither ``--file`` nor ``--dir`` is supplied or
    when indexing raises; argparse itself exits with status 2 on malformed
    options.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if not args.file and not args.dir:
        print("Error: Either --file or --dir must be specified", file=sys.stderr)
        parser.print_help()
        sys.exit(1)

    splitter_map = {
        "recursive": SplitterType.RECURSIVE,
        "semantic": SplitterType.SEMANTIC,
        "parent_child": SplitterType.PARENT_CHILD,
    }
    splitter_type = splitter_map[args.splitter]

    # Only forward the options relevant to the chosen splitter; the semantic
    # splitter receives no extra keyword arguments.
    splitter_kwargs = {}
    if splitter_type == SplitterType.RECURSIVE:
        splitter_kwargs["chunk_size"] = args.chunk_size
        splitter_kwargs["chunk_overlap"] = args.chunk_overlap
    elif splitter_type == SplitterType.PARENT_CHILD:
        splitter_kwargs["parent_chunk_size"] = args.parent_size
        splitter_kwargs["child_chunk_size"] = args.child_size
        splitter_kwargs["parent_chunk_overlap"] = args.chunk_overlap
        # Child chunks use half of the configured overlap.
        splitter_kwargs["child_chunk_overlap"] = args.chunk_overlap // 2
        splitter_kwargs["docstore_path"] = args.docstore_path
        splitter_kwargs["docstore_type"] = args.docstore_type
        splitter_kwargs["docstore_conn_string"] = args.docstore_conn

    builder = IndexBuilder(
        collection_name=args.collection,
        qdrant_url=args.qdrant_url,
        splitter_type=splitter_type,
        **splitter_kwargs
    )

    try:
        # --file takes precedence when both --file and --dir are given.
        if args.file:
            chunk_count = builder.build_from_file(args.file)
        else:
            chunk_count = builder.build_from_directory(args.dir, args.recursive)

        print(f"Indexing completed. Total chunks indexed: {chunk_count}")
        info = builder.get_collection_info()
        print(f"Collection '{info['name']}' has {info['vectors_count']} vectors (dim={info['vector_size']})")
    except Exception:
        # Top-level boundary: log the full traceback and signal failure.
        logging.exception("Indexing failed")
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user