This commit is contained in:
124
rag_indexer/example_parent_child.py
Normal file
124
rag_indexer/example_parent_child.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""
|
||||
Example demonstrating ParentDocumentRetriever usage.
|
||||
|
||||
This script shows how to:
|
||||
1. Build an index with parent-child chunking
|
||||
2. Search with child chunks (fast, precise)
|
||||
3. Search with parent context (large context)
|
||||
4. Access the retriever directly for advanced use cases
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
# Configure root logging for the demo. This call sits above the project
# imports below, so it runs first (presumably so their import-time log
# records use this format -- confirm).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
|
||||
|
||||
from builder import IndexBuilder
|
||||
from splitters import SplitterType
|
||||
|
||||
|
||||
def main():
    """Run the ParentDocumentRetriever demo end to end.

    The demo creates an ``IndexBuilder`` configured for parent-child
    splitting, writes a small sample document to ``./test_document.txt``,
    indexes it, and then prints the results of:

    * a child-chunk search,
    * a parent-context search,
    * a size comparison between the two result sets,
    * direct access to the underlying retriever,
    * the unified ``retrieve`` call, and
    * the collection info.

    The builder is closed even if one of the steps raises.

    Returns:
        The ``IndexBuilder`` used by the demo. ``close()`` has already been
        called on it by the time it is returned.
    """
    banner = "=" * 70
    query = "ParentDocumentRetriever"

    print(banner)
    print("ParentDocumentRetriever Example")
    print(banner)

    # Step 1: Create IndexBuilder with parent-child splitting
    print("\n1. Creating IndexBuilder with parent-child splitting...")
    builder = IndexBuilder(
        collection_name="parent_child_demo",
        splitter_type=SplitterType.PARENT_CHILD,
        parent_chunk_size=1000,  # Parent chunks: larger context
        child_chunk_size=200,  # Child chunks: smaller for precision
        docstore_path="./my_parent_docs",  # Where to store parent chunks
        search_k=5,  # Number of child chunks to retrieve
    )

    # Everything after construction runs under try/finally so the builder is
    # closed even when indexing or searching fails part-way through.
    try:
        print(f" Parent splitter: chunk_size={builder.get_parent_splitter().chunk_size}")
        print(f" Child splitter: chunk_size={builder.get_child_splitter().chunk_size}")
        print(f" Docstore path: {builder.get_docstore_path()}")
        print(f" Search k: {builder.retriever.search_kwargs['k']}")

        # Step 2: Build index from a sample file
        print("\n2. Building index from sample file...")

        # Create a test document
        test_content = """
This is a test document for demonstrating ParentDocumentRetriever.

Parent chunks contain larger portions of text (1000 characters),
while child chunks are smaller (200 characters) for precise retrieval.

When you search with ParentDocumentRetriever:
- It first retrieves relevant child chunks
- Then replaces them with their corresponding parent chunks
- This gives you large context while maintaining precision

Example search queries:
- "ParentDocumentRetriever"
- "child chunks"
- "large context"
- "precise retrieval"
"""

        # NOTE: the sample file is left in the working directory after the run.
        test_file = Path("./test_document.txt")
        # Explicit encoding keeps the written bytes independent of the
        # platform's default locale encoding.
        test_file.write_text(test_content, encoding="utf-8")

        chunk_count = builder.build_from_file(str(test_file))
        print(f" Indexed {chunk_count} documents")

        # Step 3: Search with child chunks (fast, precise)
        print("\n3. Searching with child chunks (fast, precise)...")
        child_results = builder.search(query, k=3)
        print(f" Found {len(child_results)} child chunks:")
        for i, doc in enumerate(child_results, 1):
            print(f" [{i}] {doc.page_content[:100]}...")

        # Step 4: Search with parent context (large context)
        print("\n4. Searching with parent context (large context)...")
        parent_results = builder.search_with_parent_context(query, k=3)
        print(f" Found {len(parent_results)} parent chunks:")
        for i, doc in enumerate(parent_results, 1):
            print(f" [{i}] {doc.page_content[:150]}...")

        # Step 5: Compare results
        print("\n5. Comparing child vs parent results...")
        child_total = sum(len(d.page_content) for d in child_results)
        parent_total = sum(len(d.page_content) for d in parent_results)
        print(f" Child chunks total length: {child_total}")
        print(f" Parent chunks total length: {parent_total}")
        # max(..., 1) guards against division by zero when no child chunk matched.
        print(f" Ratio: parent/child = {parent_total / max(child_total, 1):.2f}x larger")

        # Step 6: Access retriever directly
        print("\n6. Accessing retriever directly...")
        retriever = builder.get_retriever()
        print(f" Retriever type: {type(retriever).__name__}")
        print(f" Vectorstore: {retriever.vectorstore}")
        print(f" Docstore: {retriever.docstore}")

        # Step 7: Unified retrieval interface
        print("\n7. Using unified retrieval interface...")
        unified_results = builder.retrieve(query, return_parent=True)
        print(f" Retrieved {len(unified_results)} documents (with parent context)")

        # Step 8: Collection info
        print("\n8. Collection info...")
        info = builder.get_collection_info()
        print(f" Collection: {info['name']}")
        print(f" Vectors: {info['vectors_count']}")
        print(f" Vector size: {info['vector_size']}")
    finally:
        # Cleanup
        print("\n9. Cleaning up...")
        builder.close()

    print("\n" + banner)
    print("Example completed successfully!")
    print(banner)

    return builder
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # main() returns the builder it used; keep it bound at module level.
    builder = main()
|
||||
Reference in New Issue
Block a user