""" Example demonstrating ParentDocumentRetriever usage. This script shows how to: 1. Build an index with parent-child chunking 2. Search with child chunks (fast, precise) 3. Search with parent context (large context) 4. Access the retriever directly for advanced use cases """ import logging from pathlib import Path # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) from builder import IndexBuilder from splitters import SplitterType def main(): print("=" * 70) print("ParentDocumentRetriever Example") print("=" * 70) # Step 1: Create IndexBuilder with parent-child splitting print("\n1. Creating IndexBuilder with parent-child splitting...") builder = IndexBuilder( collection_name="parent_child_demo", splitter_type=SplitterType.PARENT_CHILD, parent_chunk_size=1000, # Parent chunks: larger context child_chunk_size=200, # Child chunks: smaller for precision docstore_path="./my_parent_docs", # Where to store parent chunks search_k=5, # Number of child chunks to retrieve ) print(f" Parent splitter: chunk_size={builder.get_parent_splitter().chunk_size}") print(f" Child splitter: chunk_size={builder.get_child_splitter().chunk_size}") print(f" Docstore path: {builder.get_docstore_path()}") print(f" Search k: {builder.retriever.search_kwargs['k']}") # Step 2: Build index from a sample file print("\n2. Building index from sample file...") # Create a test document test_content = """ This is a test document for demonstrating ParentDocumentRetriever. Parent chunks contain larger portions of text (1000 characters), while child chunks are smaller (200 characters) for precise retrieval. When you search with ParentDocumentRetriever: - It first retrieves relevant child chunks - Then replaces them with their corresponding parent chunks - This gives you large context while maintaining precision Example search queries: - "ParentDocumentRetriever" - "child chunks" - "large context" - "precise retrieval" """ test_file = Path("./test_document.txt") test_file.write_text(test_content) chunk_count = builder.build_from_file(str(test_file)) print(f" Indexed {chunk_count} documents") # Step 3: Search with child chunks (fast, precise) print("\n3. Searching with child chunks (fast, precise)...") child_results = builder.search("ParentDocumentRetriever", k=3) print(f" Found {len(child_results)} child chunks:") for i, doc in enumerate(child_results, 1): print(f" [{i}] {doc.page_content[:100]}...") # Step 4: Search with parent context (large context) print("\n4. Searching with parent context (large context)...") parent_results = builder.search_with_parent_context("ParentDocumentRetriever", k=3) print(f" Found {len(parent_results)} parent chunks:") for i, doc in enumerate(parent_results, 1): print(f" [{i}] {doc.page_content[:150]}...") # Step 5: Compare results print("\n5. Comparing child vs parent results...") print(f" Child chunks total length: {sum(len(d.page_content) for d in child_results)}") print(f" Parent chunks total length: {sum(len(d.page_content) for d in parent_results)}") print(f" Ratio: parent/child = {sum(len(d.page_content) for d in parent_results) / max(sum(len(d.page_content) for d in child_results), 1):.2f}x larger") # Step 6: Access retriever directly print("\n6. Accessing retriever directly...") retriever = builder.get_retriever() print(f" Retriever type: {type(retriever).__name__}") print(f" Vectorstore: {retriever.vectorstore}") print(f" Docstore: {retriever.docstore}") # Step 7: Unified retrieval interface print("\n7. Using unified retrieval interface...") unified_results = builder.retrieve("ParentDocumentRetriever", return_parent=True) print(f" Retrieved {len(unified_results)} documents (with parent context)") # Step 8: Collection info print("\n8. Collection info...") info = builder.get_collection_info() print(f" Collection: {info['name']}") print(f" Vectors: {info['vectors_count']}") print(f" Vector size: {info['vector_size']}") # Cleanup print("\n9. Cleaning up...") builder.close() print("\n" + "=" * 70) print("Example completed successfully!") print("=" * 70) return builder if __name__ == "__main__": builder = main()