125 lines
4.5 KiB
Python
125 lines
4.5 KiB
Python
|
|
"""
|
||
|
|
Example demonstrating ParentDocumentRetriever usage.

This script shows how to:
1. Build an index with parent-child chunking
2. Search with child chunks (fast, precise)
3. Search with parent context (large context)
4. Access the retriever directly for advanced use cases
|
||
|
|
"""
|
||
|
|
|
||
|
|
import logging
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Configure logging: INFO threshold, with each record rendered as
# "timestamp - logger name - level - message".
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
|
||
|
|
|
||
|
|
from builder import IndexBuilder
|
||
|
|
from splitters import SplitterType
|
||
|
|
|
||
|
|
|
||
|
|
def _total_length(docs):
    """Return the combined length of ``page_content`` across *docs*."""
    return sum(len(doc.page_content) for doc in docs)


def main():
    """Run the ParentDocumentRetriever walkthrough and print each step.

    The function builds an ``IndexBuilder`` configured for parent-child
    splitting, writes a small sample document to ``./test_document.txt``,
    indexes it, and then exercises the child-chunk search, the parent-context
    search, direct retriever access, the unified ``retrieve`` call, and the
    collection info lookup, printing the outcome of every step.

    Returns:
        The ``IndexBuilder`` used for the demo. ``close()`` has already been
        called on it when this function returns.

    Raises:
        Any exception raised by the builder or by writing the sample file
        propagates to the caller; ``builder.close()`` is still called first.
    """
    banner = "=" * 70
    query = "ParentDocumentRetriever"

    print(banner)
    print("ParentDocumentRetriever Example")
    print(banner)

    # Step 1: Create IndexBuilder with parent-child splitting
    print("\n1. Creating IndexBuilder with parent-child splitting...")
    builder = IndexBuilder(
        collection_name="parent_child_demo",
        splitter_type=SplitterType.PARENT_CHILD,
        parent_chunk_size=1000,  # Parent chunks: larger context
        child_chunk_size=200,  # Child chunks: smaller for precision
        docstore_path="./my_parent_docs",  # Where to store parent chunks
        search_k=5,  # Number of child chunks to retrieve
    )

    # Everything after construction runs under try/finally so that
    # builder.close() is called even when one of the steps raises.
    try:
        print(f" Parent splitter: chunk_size={builder.get_parent_splitter().chunk_size}")
        print(f" Child splitter: chunk_size={builder.get_child_splitter().chunk_size}")
        print(f" Docstore path: {builder.get_docstore_path()}")
        print(f" Search k: {builder.retriever.search_kwargs['k']}")

        # Step 2: Build index from a sample file
        print("\n2. Building index from sample file...")

        # Create a test document. The text is kept flush-left so the file
        # written to disk carries no leading indentation.
        test_content = """
This is a test document for demonstrating ParentDocumentRetriever.

Parent chunks contain larger portions of text (1000 characters),
while child chunks are smaller (200 characters) for precise retrieval.

When you search with ParentDocumentRetriever:
- It first retrieves relevant child chunks
- Then replaces them with their corresponding parent chunks
- This gives you large context while maintaining precision

Example search queries:
- "ParentDocumentRetriever"
- "child chunks"
- "large context"
- "precise retrieval"
"""

        test_file = Path("./test_document.txt")
        # Explicit encoding: do not depend on the platform default.
        test_file.write_text(test_content, encoding="utf-8")

        chunk_count = builder.build_from_file(str(test_file))
        print(f" Indexed {chunk_count} documents")

        # Step 3: Search with child chunks (fast, precise)
        print("\n3. Searching with child chunks (fast, precise)...")
        child_results = builder.search(query, k=3)
        print(f" Found {len(child_results)} child chunks:")
        for i, doc in enumerate(child_results, 1):
            print(f" [{i}] {doc.page_content[:100]}...")

        # Step 4: Search with parent context (large context)
        print("\n4. Searching with parent context (large context)...")
        parent_results = builder.search_with_parent_context(query, k=3)
        print(f" Found {len(parent_results)} parent chunks:")
        for i, doc in enumerate(parent_results, 1):
            print(f" [{i}] {doc.page_content[:150]}...")

        # Step 5: Compare results
        print("\n5. Comparing child vs parent results...")
        child_total = _total_length(child_results)
        parent_total = _total_length(parent_results)
        print(f" Child chunks total length: {child_total}")
        print(f" Parent chunks total length: {parent_total}")
        # max(..., 1) guards the division when no child chunks were found.
        print(f" Ratio: parent/child = {parent_total / max(child_total, 1):.2f}x larger")

        # Step 6: Access retriever directly
        print("\n6. Accessing retriever directly...")
        retriever = builder.get_retriever()
        print(f" Retriever type: {type(retriever).__name__}")
        print(f" Vectorstore: {retriever.vectorstore}")
        print(f" Docstore: {retriever.docstore}")

        # Step 7: Unified retrieval interface
        print("\n7. Using unified retrieval interface...")
        unified_results = builder.retrieve(query, return_parent=True)
        print(f" Retrieved {len(unified_results)} documents (with parent context)")

        # Step 8: Collection info
        print("\n8. Collection info...")
        info = builder.get_collection_info()
        print(f" Collection: {info['name']}")
        print(f" Vectors: {info['vectors_count']}")
        print(f" Vector size: {info['vector_size']}")
    finally:
        # Cleanup
        print("\n9. Cleaning up...")
        builder.close()

    print("\n" + banner)
    print("Example completed successfully!")
    print(banner)

    return builder
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: run the demo only when executed directly. The result
# is bound to a module-level name so it stays reachable after main() returns.
if __name__ == "__main__":
    builder = main()
|