# Source: ailine/rag_indexer/builder.py (commit c18e8a98, 2026-04-18, "向量数据库").
# NOTE(review): the original lines here were Gitea web-UI residue (file listing,
# CI status "构建并部署 AI Agent 服务 / deploy Failing", "278 lines / 10 KiB")
# captured by extraction; folded into this comment so the module parses.
"""
Core pipeline builder for offline RAG index construction.
Now supports LangChain's ParentDocumentRetriever for parent-child chunking.
"""
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple, Union

from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import BaseStore, LocalFileStore
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from .docstore_manager import PostgresDocStore, create_docstore, get_docstore
from .embedders import LlamaCppEmbedder
from .loaders import DocumentLoader
from .splitters import ParentChildSplitter, SplitterType, get_splitter
from .vector_store import QdrantVectorStore
logger = logging.getLogger(__name__)
@dataclass
class ParentChildConfig:
    """Configuration for parent-child splitting.

    Bundles the chunk-size/overlap knobs for the parent and child splitters
    together with the docstore settings used to persist parent chunks.
    """
    parent_chunk_size: int = 1000  # characters per parent (context) chunk
    child_chunk_size: int = 200  # characters per child (embedded) chunk
    parent_chunk_overlap: int = 100  # overlap between adjacent parent chunks
    child_chunk_overlap: int = 20  # overlap between adjacent child chunks
    search_k: int = 5  # default number of results per retrieval query
    docstore_path: Optional[str] = None  # filesystem path for the local docstore
    docstore_type: str = "local"  # "local" or "postgres" — see _init_parent_child_retriever
    docstore_conn_string: Optional[str] = None  # only used when docstore_type == "postgres"
class IndexBuilder:
    """Main pipeline for RAG index construction.

    Supports flat chunking strategies (recursive, semantic, ...) as well as
    LangChain's ParentDocumentRetriever for parent-child chunking, where small
    child chunks are embedded for search and larger parent chunks are returned
    as context.
    """

    def __init__(
        self,
        collection_name: str = "rag_documents",
        qdrant_url: str = None,
        splitter_type: SplitterType = SplitterType.RECURSIVE,
        **splitter_kwargs,
    ):
        """Wire up loader, embedder, vector store and splitter(s).

        Args:
            collection_name: Qdrant collection that receives (child) chunks.
            qdrant_url: Qdrant endpoint; None lets QdrantVectorStore use its default.
            splitter_type: Chunking strategy; PARENT_CHILD enables the retriever path.
            **splitter_kwargs: Forwarded to the splitter factory, or — for
                PARENT_CHILD — parsed for parent/child sizes and docstore options
                (the keys mirror ParentChildConfig's fields).
        """
        self.collection_name = collection_name
        self.qdrant_url = qdrant_url
        self.splitter_type = splitter_type
        self.splitter_kwargs = splitter_kwargs

        # Core components.
        self.loader = DocumentLoader()
        self.embedder = LlamaCppEmbedder()
        self.embeddings = self.embedder.as_langchain_embeddings()
        self.vector_store = QdrantVectorStore(
            collection_name=collection_name,
            embeddings=self.embeddings,
            qdrant_url=qdrant_url,
        )

        # Flat splitter (parent-child is handled by the retriever instead).
        if splitter_type != SplitterType.PARENT_CHILD:
            if splitter_type == SplitterType.SEMANTIC:
                # Semantic splitting needs access to the embedding model.
                splitter_kwargs["embeddings"] = self.embeddings
            self.splitter = get_splitter(splitter_type, **splitter_kwargs)
        else:
            self.splitter = None

        # BUG FIX: the kwargs were previously dropped here (the method was
        # called with no arguments), so custom parent/child chunk sizes and
        # docstore settings were silently ignored and defaults always applied.
        self._init_parent_child_retriever(**self.splitter_kwargs)

    def _init_parent_child_retriever(self, **kwargs):
        """Initialize ParentDocumentRetriever for parent-child chunking.

        Builds the parent/child text splitters, the docstore that persists
        parent chunks, and the retriever that ties them to the vector store.
        Accepted kwargs mirror ParentChildConfig's fields.
        """
        # Chunk geometry; a generic "chunk_overlap" acts as a fallback for
        # both parent and child overlap.
        parent_size = kwargs.get("parent_chunk_size", 1000)
        child_size = kwargs.get("child_chunk_size", 200)
        parent_overlap = kwargs.get("parent_chunk_overlap", kwargs.get("chunk_overlap", 100))
        child_overlap = kwargs.get("child_chunk_overlap", kwargs.get("chunk_overlap", 20))

        # Parent chunks become the returned context; child chunks get embedded.
        self.parent_splitter = RecursiveCharacterTextSplitter(
            chunk_size=parent_size,
            chunk_overlap=parent_overlap,
        )
        self.child_splitter = RecursiveCharacterTextSplitter(
            chunk_size=child_size,
            chunk_overlap=child_overlap,
        )

        # Vector store (for child chunks).
        self.vector_store_obj = self.vector_store.get_langchain_vectorstore()

        # Document store (for parent chunks): Postgres when a connection
        # string is supplied, otherwise a local store at docstore_path.
        docstore_path = kwargs.get("docstore_path")
        docstore_type = kwargs.get("docstore_type", "local")
        docstore_conn = kwargs.get("docstore_conn_string")
        if docstore_type == "postgres" and docstore_conn:
            self.docstore = PostgresDocStore(docstore_conn)
            self._docstore_conn = docstore_conn
        else:
            self.docstore = get_docstore(docstore_path)
            self._docstore_conn = None

        self.retriever = ParentDocumentRetriever(
            vectorstore=self.vector_store_obj,
            docstore=self.docstore,
            child_splitter=self.child_splitter,
            parent_splitter=self.parent_splitter,
            search_kwargs={"k": kwargs.get("search_k", 5)},
        )

    def build_from_file(self, file_path: Union[str, Path]) -> int:
        """Load a single file and index it; returns the number of indexed units."""
        logger.info("Loading file: %s", file_path)
        documents = self.loader.load_file(file_path)
        logger.info("Loaded %d documents", len(documents))
        return self._process_documents(documents)

    def build_from_directory(self, directory_path: Union[str, Path], recursive: bool = True) -> int:
        """Load every supported file in a directory and index it."""
        logger.info("Loading directory: %s (recursive=%s)", directory_path, recursive)
        documents = self.loader.load_directory(directory_path, recursive=recursive)
        logger.info("Loaded %d documents from directory", len(documents))
        return self._process_documents(documents)

    def _process_documents(self, documents: List[Document]) -> int:
        """Split and index documents; returns document count (parent-child)
        or chunk count (flat splitters)."""
        if not documents:
            logger.warning("No documents to process")
            return 0
        if self.splitter_type == SplitterType.PARENT_CHILD:
            logger.info("Using LangChain ParentDocumentRetriever")
            # Ensure the child-chunk collection exists before adding.
            self.vector_store.create_collection()
            # The retriever handles parent-child splitting, id mapping and storage.
            self.retriever.add_documents(documents)
            # Rough size estimate only — actual counts depend on document length.
            # BUG FIX: the old variable was misnamed "estimated_parent_chunks"
            # even though it estimates child chunks; also use lazy %-logging.
            estimated_child_chunks = len(documents) * (
                self.parent_splitter._chunk_size // self.child_splitter._chunk_size
            )
            logger.info(
                "Indexed with ParentDocumentRetriever: ~%d parent chunks, ~%d child chunks",
                len(documents),
                estimated_child_chunks,
            )
            return len(documents)
        else:
            logger.info("Splitting documents using %s", self.splitter_type)
            chunks = self.splitter.split_documents(documents)
            logger.info("Split into %d chunks", len(chunks))
            self.vector_store.create_collection()
            self.vector_store.add_documents(chunks)
            return len(chunks)

    def get_collection_info(self):
        """Return Qdrant collection metadata (delegates to the vector store)."""
        return self.vector_store.get_collection_info()

    def search(self, query: str, k: int = 5) -> List[Document]:
        """Standard search - returns child chunks."""
        return self.vector_store.similarity_search(query, k=k)

    def search_with_parent_context(self, query: str, k: int = 5) -> List[Document]:
        """Search with parent context - returns full parent chunks.

        This is the main retrieval method when using parent-child splitting.

        Raises:
            RuntimeError: if the builder is not configured for PARENT_CHILD.
        """
        if self.splitter_type != SplitterType.PARENT_CHILD:
            raise RuntimeError(
                "search_with_parent_context only available with PARENT_CHILD splitter. "
                "Use search() for standard retrieval."
            )
        # BUG FIX: get_relevant_documents() does not accept `k` as a call-time
        # kwarg for ParentDocumentRetriever; set it via search_kwargs instead.
        self.retriever.search_kwargs["k"] = k
        return self.retriever.get_relevant_documents(query)

    def retrieve(self, query: str, return_parent: bool = True) -> List[Document]:
        """Unified retrieval interface.

        Args:
            query: Search query
            return_parent: If True and using parent-child splitter, return parent chunks
                           If False, always return child chunks
        Returns:
            List of relevant documents
        """
        if self.splitter_type == SplitterType.PARENT_CHILD and return_parent:
            return self.search_with_parent_context(query)
        else:
            return self.search(query)

    def get_retriever(self) -> ParentDocumentRetriever:
        """Get the ParentDocumentRetriever instance directly.

        Useful for advanced use cases where you want to access the retriever
        outside of IndexBuilder.
        """
        if self.splitter_type != SplitterType.PARENT_CHILD:
            raise RuntimeError(
                "get_retriever() only available with PARENT_CHILD splitter. "
                "Use search() or search_with_parent_context() for standard retrieval."
            )
        return self.retriever

    def get_child_splitter(self) -> "RecursiveCharacterTextSplitter":
        """Get the child splitter (or the flat splitter when not PARENT_CHILD)."""
        if self.splitter_type != SplitterType.PARENT_CHILD:
            return self.splitter
        return self.child_splitter

    def get_parent_splitter(self) -> "RecursiveCharacterTextSplitter":
        """Get the parent splitter for reconfiguration."""
        if self.splitter_type != SplitterType.PARENT_CHILD:
            raise RuntimeError(
                "Parent splitter only available with PARENT_CHILD splitter."
            )
        return self.parent_splitter

    def get_docstore(self) -> BaseStore:
        """Get the document store for parent chunks."""
        if self.splitter_type != SplitterType.PARENT_CHILD:
            raise RuntimeError(
                "Docstore only available with PARENT_CHILD splitter."
            )
        return self.docstore

    def get_docstore_path(self) -> str:
        """Get the document store path.

        NOTE(review): assumes the docstore exposes `persist_path` — true for
        local stores; a PostgresDocStore likely does not. Verify before calling
        with docstore_type="postgres".
        """
        if self.splitter_type != SplitterType.PARENT_CHILD:
            raise RuntimeError(
                "Docstore path only available with PARENT_CHILD splitter."
            )
        return self.docstore.persist_path

    def close(self):
        """Close external resources held by the builder.

        BUG FIX: the old implementation opened a *new* psycopg2 connection
        with the stored conn string and immediately closed it, which never
        released the docstore's own connection. Delegate to the docstore's
        close() when it provides one instead.
        """
        docstore = getattr(self, "docstore", None)
        if docstore is not None and hasattr(docstore, "close"):
            docstore.close()
            logger.info("Closed docstore resources")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Context-manager exit: release resources, never suppress exceptions.
        self.close()
        return False
# NOTE: RecursiveCharacterTextSplitter is imported at the top of the module
# (the former bottom-of-file import has been hoisted with the other imports).
if __name__ == "__main__":
    # Example usage: build a parent-child index and inspect its components.
    builder = IndexBuilder(
        splitter_type=SplitterType.PARENT_CHILD,
        parent_chunk_size=1000,
        child_chunk_size=200,
        docstore_path="./my_parent_docs",
    )
    # BUG FIX: langchain's TextSplitter stores the size on the private
    # `_chunk_size` attribute (as _process_documents already relies on);
    # `.chunk_size` raised AttributeError here.
    print("Parent splitter:", builder.get_parent_splitter()._chunk_size)
    print("Child splitter:", builder.get_child_splitter()._chunk_size)
    print("Docstore path:", builder.get_docstore_path())
    print("Retriever:", builder.get_retriever())