This commit is contained in:
277
rag_indexer/builder.py
Normal file
277
rag_indexer/builder.py
Normal file
@@ -0,0 +1,277 @@
|
||||
"""
|
||||
Core pipeline builder for offline RAG index construction.
|
||||
|
||||
Now supports LangChain's ParentDocumentRetriever for parent-child chunking.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List, Union, Optional, Tuple
|
||||
from dataclasses import dataclass
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain.retrievers import ParentDocumentRetriever
|
||||
from langchain.storage import LocalFileStore, BaseStore
|
||||
|
||||
from .loaders import DocumentLoader
|
||||
from .splitters import SplitterType, get_splitter, ParentChildSplitter
|
||||
from .embedders import LlamaCppEmbedder
|
||||
from .vector_store import QdrantVectorStore
|
||||
from .docstore_manager import get_docstore, PostgresDocStore, create_docstore
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParentChildConfig:
|
||||
"""Configuration for parent-child splitting."""
|
||||
parent_chunk_size: int = 1000
|
||||
child_chunk_size: int = 200
|
||||
parent_chunk_overlap: int = 100
|
||||
child_chunk_overlap: int = 20
|
||||
search_k: int = 5
|
||||
docstore_path: str = None
|
||||
docstore_type: str = "local"
|
||||
docstore_conn_string: str = None
|
||||
|
||||
|
||||
class IndexBuilder:
|
||||
"""Main pipeline for RAG index construction."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
collection_name: str = "rag_documents",
|
||||
qdrant_url: str = None,
|
||||
splitter_type: SplitterType = SplitterType.RECURSIVE,
|
||||
**splitter_kwargs,
|
||||
):
|
||||
self.collection_name = collection_name
|
||||
self.qdrant_url = qdrant_url
|
||||
self.splitter_type = splitter_type
|
||||
self.splitter_kwargs = splitter_kwargs
|
||||
|
||||
# Components
|
||||
self.loader = DocumentLoader()
|
||||
self.embedder = LlamaCppEmbedder()
|
||||
self.embeddings = self.embedder.as_langchain_embeddings()
|
||||
|
||||
self.vector_store = QdrantVectorStore(
|
||||
collection_name=collection_name,
|
||||
embeddings=self.embeddings,
|
||||
qdrant_url=qdrant_url,
|
||||
)
|
||||
|
||||
# Splitter (except parent-child which is handled separately)
|
||||
if splitter_type != SplitterType.PARENT_CHILD:
|
||||
if splitter_type == SplitterType.SEMANTIC:
|
||||
splitter_kwargs["embeddings"] = self.embeddings
|
||||
self.splitter = get_splitter(splitter_type, **splitter_kwargs)
|
||||
else:
|
||||
self.splitter = None
|
||||
# Initialize ParentDocumentRetriever for parent-child splitting
|
||||
self._init_parent_child_retriever()
|
||||
|
||||
def _init_parent_child_retriever(self, **kwargs):
|
||||
"""
|
||||
Initialize ParentDocumentRetriever for parent-child chunking.
|
||||
|
||||
This replaces the custom ParentChildSplitter logic.
|
||||
"""
|
||||
# Parse kwargs for parent-child config
|
||||
parent_size = kwargs.get("parent_chunk_size", 1000)
|
||||
child_size = kwargs.get("child_chunk_size", 200)
|
||||
parent_overlap = kwargs.get("parent_chunk_overlap", kwargs.get("chunk_overlap", 100))
|
||||
child_overlap = kwargs.get("child_chunk_overlap", kwargs.get("chunk_overlap", 20))
|
||||
|
||||
# Define splitters
|
||||
self.parent_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=parent_size,
|
||||
chunk_overlap=parent_overlap,
|
||||
)
|
||||
self.child_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=child_size,
|
||||
chunk_overlap=child_overlap,
|
||||
)
|
||||
|
||||
# Vector store (for child chunks)
|
||||
self.vector_store_obj = self.vector_store.get_langchain_vectorstore()
|
||||
|
||||
# Document store (for parent chunks)
|
||||
docstore_path = kwargs.get("docstore_path")
|
||||
docstore_type = kwargs.get("docstore_type", "local")
|
||||
docstore_conn = kwargs.get("docstore_conn_string")
|
||||
|
||||
if docstore_type == "postgres" and docstore_conn:
|
||||
self.docstore = PostgresDocStore(docstore_conn)
|
||||
self._docstore_conn = docstore_conn
|
||||
else:
|
||||
self.docstore = get_docstore(docstore_path)
|
||||
self._docstore_conn = None
|
||||
|
||||
# Create retriever
|
||||
self.retriever = ParentDocumentRetriever(
|
||||
vectorstore=self.vector_store_obj,
|
||||
docstore=self.docstore,
|
||||
child_splitter=self.child_splitter,
|
||||
parent_splitter=self.parent_splitter,
|
||||
search_kwargs={"k": kwargs.get("search_k", 5)},
|
||||
)
|
||||
|
||||
def build_from_file(self, file_path: Union[str, Path]) -> int:
|
||||
logger.info("Loading file: %s", file_path)
|
||||
documents = self.loader.load_file(file_path)
|
||||
logger.info("Loaded %d documents", len(documents))
|
||||
return self._process_documents(documents)
|
||||
|
||||
def build_from_directory(self, directory_path: Union[str, Path], recursive: bool = True) -> int:
|
||||
logger.info("Loading directory: %s (recursive=%s)", directory_path, recursive)
|
||||
documents = self.loader.load_directory(directory_path, recursive=recursive)
|
||||
logger.info("Loaded %d documents from directory", len(documents))
|
||||
return self._process_documents(documents)
|
||||
|
||||
def _process_documents(self, documents: List[Document]) -> int:
|
||||
if not documents:
|
||||
logger.warning("No documents to process")
|
||||
return 0
|
||||
|
||||
if self.splitter_type == SplitterType.PARENT_CHILD:
|
||||
logger.info("Using LangChain ParentDocumentRetriever")
|
||||
|
||||
# Ensure collection exists for child chunks
|
||||
self.vector_store.create_collection()
|
||||
|
||||
# Use ParentDocumentRetriever to add documents
|
||||
# This automatically handles parent-child splitting, mapping, and retrieval
|
||||
self.retriever.add_documents(documents)
|
||||
|
||||
# Log estimated chunk counts
|
||||
estimated_parent_chunks = len(documents) * (self.parent_splitter._chunk_size // self.child_splitter._chunk_size)
|
||||
logger.info(
|
||||
"Indexed with ParentDocumentRetriever: "
|
||||
f"~{len(documents)} parent chunks, ~{estimated_parent_chunks} child chunks"
|
||||
)
|
||||
return len(documents)
|
||||
|
||||
else:
|
||||
logger.info("Splitting documents using %s", self.splitter_type)
|
||||
chunks = self.splitter.split_documents(documents)
|
||||
logger.info("Split into %d chunks", len(chunks))
|
||||
|
||||
self.vector_store.create_collection()
|
||||
self.vector_store.add_documents(chunks)
|
||||
return len(chunks)
|
||||
|
||||
def get_collection_info(self):
|
||||
return self.vector_store.get_collection_info()
|
||||
|
||||
def search(self, query: str, k: int = 5) -> List[Document]:
|
||||
"""Standard search - returns child chunks."""
|
||||
return self.vector_store.similarity_search(query, k=k)
|
||||
|
||||
def search_with_parent_context(self, query: str, k: int = 5) -> List[Document]:
|
||||
"""
|
||||
Search with parent context - returns full parent chunks.
|
||||
|
||||
This is the main retrieval method when using parent-child splitting.
|
||||
"""
|
||||
if self.splitter_type != SplitterType.PARENT_CHILD:
|
||||
raise RuntimeError(
|
||||
"search_with_parent_context only available with PARENT_CHILD splitter. "
|
||||
"Use search() for standard retrieval."
|
||||
)
|
||||
return self.retriever.get_relevant_documents(query, k=k)
|
||||
|
||||
def retrieve(self, query: str, return_parent: bool = True) -> List[Document]:
|
||||
"""
|
||||
Unified retrieval interface.
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
return_parent: If True and using parent-child splitter, return parent chunks
|
||||
If False, always return child chunks
|
||||
|
||||
Returns:
|
||||
List of relevant documents
|
||||
"""
|
||||
if self.splitter_type == SplitterType.PARENT_CHILD and return_parent:
|
||||
return self.search_with_parent_context(query)
|
||||
else:
|
||||
return self.search(query)
|
||||
|
||||
def get_retriever(self) -> ParentDocumentRetriever:
|
||||
"""
|
||||
Get the ParentDocumentRetriever instance directly.
|
||||
|
||||
Useful for advanced use cases where you want to access the retriever
|
||||
outside of IndexBuilder.
|
||||
"""
|
||||
if self.splitter_type != SplitterType.PARENT_CHILD:
|
||||
raise RuntimeError(
|
||||
"get_retriever() only available with PARENT_CHILD splitter. "
|
||||
"Use search() or search_with_parent_context() for standard retrieval."
|
||||
)
|
||||
return self.retriever
|
||||
|
||||
def get_child_splitter(self) -> "RecursiveCharacterTextSplitter":
|
||||
"""Get the child splitter for reconfiguration."""
|
||||
if self.splitter_type != SplitterType.PARENT_CHILD:
|
||||
return self.splitter
|
||||
return self.child_splitter
|
||||
|
||||
def get_parent_splitter(self) -> "RecursiveCharacterTextSplitter":
|
||||
"""Get the parent splitter for reconfiguration."""
|
||||
if self.splitter_type != SplitterType.PARENT_CHILD:
|
||||
raise RuntimeError(
|
||||
"Parent splitter only available with PARENT_CHILD splitter."
|
||||
)
|
||||
return self.parent_splitter
|
||||
|
||||
def get_docstore(self) -> BaseStore:
|
||||
"""Get the document store for parent chunks."""
|
||||
if self.splitter_type != SplitterType.PARENT_CHILD:
|
||||
raise RuntimeError(
|
||||
"Docstore only available with PARENT_CHILD splitter."
|
||||
)
|
||||
return self.docstore
|
||||
|
||||
def get_docstore_path(self) -> str:
|
||||
"""Get the document store path."""
|
||||
if self.splitter_type != SplitterType.PARENT_CHILD:
|
||||
raise RuntimeError(
|
||||
"Docstore path only available with PARENT_CHILD splitter."
|
||||
)
|
||||
return self.docstore.persist_path
|
||||
|
||||
def close(self):
|
||||
"""Close resources."""
|
||||
if hasattr(self, "_docstore_conn") and self._docstore_conn:
|
||||
import psycopg2
|
||||
conn = psycopg2.connect(self._docstore_conn)
|
||||
conn.close()
|
||||
logger.info("Closed PostgreSQL connection")
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.close()
|
||||
return False
|
||||
|
||||
|
||||
# RecursiveCharacterTextSplitter needs to be imported
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Example usage
|
||||
builder = IndexBuilder(
|
||||
splitter_type=SplitterType.PARENT_CHILD,
|
||||
parent_chunk_size=1000,
|
||||
child_chunk_size=200,
|
||||
docstore_path="./my_parent_docs",
|
||||
)
|
||||
|
||||
print("Parent splitter:", builder.get_parent_splitter().chunk_size)
|
||||
print("Child splitter:", builder.get_child_splitter().chunk_size)
|
||||
print("Docstore path:", builder.get_docstore_path())
|
||||
print("Retriever:", builder.get_retriever())
|
||||
Reference in New Issue
Block a user