""" Command-line interface for the RAG index builder. """ import argparse import logging import sys from builder import IndexBuilder from splitters import SplitterType logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) def main(): parser = argparse.ArgumentParser(description="Offline RAG Index Builder") parser.add_argument("--file", type=str, help="Path to file to index") parser.add_argument("--dir", type=str, help="Path to directory to index") parser.add_argument("--recursive", action="store_true", default=True, help="Recursively process directories (default: True)") parser.add_argument("--collection", type=str, default="rag_documents", help="Qdrant collection name (default: rag_documents)") parser.add_argument("--qdrant-url", type=str, help="Qdrant server URL (default: http://127.0.0.1:6333)") parser.add_argument("--splitter", type=str, choices=["recursive", "semantic", "parent_child"], default="recursive", help="Text splitting strategy (default: recursive)") parser.add_argument("--chunk-size", type=int, default=500, help="Chunk size for recursive/parent splitter (default: 500)") parser.add_argument("--chunk-overlap", type=int, default=50, parser.add_argument("--docstore-path", type=str, default=None, help="Path to store parent documents for parent-child splitter (default: ./parent_docs or HERMES_HOME/parent_docs)") parser.add_argument("--docstore-type", type=str, choices=["local", "postgres"], default="local", help="Type of docstore: 'local' (default) or 'postgres' for PostgreSQL-backed storage") parser.add_argument("--docstore-conn", type=str, default=None, help="PostgreSQL connection string for postgres docstore") help="Chunk overlap (default: 50)") parser.add_argument("--parent-size", type=int, default=1000, help="Parent chunk size for parent-child splitter (default: 1000)") parser.add_argument("--child-size", type=int, default=200, help="Child chunk size for parent-child splitter (default: 200)") args = parser.parse_args() if not args.file and not args.dir: print("Error: Either --file or --dir must be specified", file=sys.stderr) parser.print_help() sys.exit(1) splitter_map = { "recursive": SplitterType.RECURSIVE, "semantic": SplitterType.SEMANTIC, "parent_child": SplitterType.PARENT_CHILD, } splitter_type = splitter_map[args.splitter] splitter_kwargs = {} if splitter_type == SplitterType.RECURSIVE: splitter_kwargs["chunk_size"] = args.chunk_size splitter_kwargs["chunk_overlap"] = args.chunk_overlap elif splitter_type == SplitterType.PARENT_CHILD: splitter_kwargs["parent_chunk_size"] = args.parent_size splitter_kwargs["child_chunk_size"] = args.child_size splitter_kwargs["parent_chunk_overlap"] = args.chunk_overlap splitter_kwargs["child_chunk_overlap"] = args.chunk_overlap // 2 splitter_kwargs["docstore_path"] = args.docstore_path splitter_kwargs["docstore_type"] = args.docstore_type splitter_kwargs["docstore_conn_string"] = args.docstore_conn builder = IndexBuilder( collection_name=args.collection, qdrant_url=args.qdrant_url, splitter_type=splitter_type, **splitter_kwargs ) try: if args.file: chunk_count = builder.build_from_file(args.file) else: chunk_count = builder.build_from_directory(args.dir, args.recursive) print(f"Indexing completed. Total chunks indexed: {chunk_count}") info = builder.get_collection_info() print(f"Collection '{info['name']}' has {info['vectors_count']} vectors (dim={info['vector_size']})") except Exception as e: logging.exception("Indexing failed") sys.exit(1) if __name__ == "__main__": main()