Files
ailine/rag_indexer/splitters.py
root c18e8a9860
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Failing after 32m6s
向量数据库
2026-04-18 16:56:23 +08:00

71 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Text splitters for chunking documents.
"""
import uuid
from enum import Enum
from typing import List, Optional

from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter
class SplitterType(str, Enum):
    """Supported text-splitting strategies.

    Subclasses ``str`` so members compare equal to their plain string
    values (e.g. ``SplitterType.RECURSIVE == "recursive"``), which lets
    callers pass either the enum member or the raw string.
    """

    # Character-based recursive splitting (default).
    RECURSIVE = "recursive"
    # Embedding-driven semantic boundary detection.
    SEMANTIC = "semantic"
    # Large parent chunks for context + small child chunks for retrieval.
    PARENT_CHILD = "parent_child"
def get_splitter(splitter_type: SplitterType, **kwargs):
    """Factory function to create a text splitter.

    Args:
        splitter_type: Which splitting strategy to build. Note that
            ``PARENT_CHILD`` is NOT handled here — construct
            ``ParentChildSplitter`` directly for that strategy.
        **kwargs: Strategy-specific options:
            - recursive: ``chunk_size`` (default 500), ``chunk_overlap``
              (default 50).
            - semantic: ``embeddings`` (required embedding model).

    Returns:
        A configured ``RecursiveCharacterTextSplitter`` or ``SemanticChunker``.

    Raises:
        ValueError: If ``splitter_type`` is unsupported, or if the semantic
            splitter is requested without an ``embeddings`` kwarg.
    """
    if splitter_type == SplitterType.RECURSIVE:
        chunk_size = kwargs.get("chunk_size", 500)
        chunk_overlap = kwargs.get("chunk_overlap", 50)
        # BUG FIX: the original list contained duplicate empty strings where
        # CJK punctuation characters had been garbled away. An "" entry in
        # the middle of the list splits per-character, which made every
        # separator after it unreachable. Restored common Chinese sentence/
        # clause delimiters here — NOTE(review): confirm these match the
        # punctuation originally intended.
        return RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", "。", "！", "？", "，", " ", ""],
        )
    elif splitter_type == SplitterType.SEMANTIC:
        # Semantic splitting finds chunk boundaries via embedding similarity,
        # so an embedding model is mandatory.
        embeddings = kwargs.get("embeddings")
        if embeddings is None:
            raise ValueError("Semantic splitter requires 'embeddings' parameter")
        return SemanticChunker(embeddings=embeddings)
    else:
        raise ValueError(f"Unsupported splitter type: {splitter_type}")
class ParentChildSplitter:
    """
    Splits documents into parent (large) and child (small) chunks.

    Child chunks are indexed for retrieval; parent chunks are stored so the
    larger surrounding context can be returned at query time. Every parent
    chunk receives a unique ``metadata["chunk_id"]``, and every child chunk
    carries its parent's ID in ``metadata["parent_id"]``, so a retrieved
    child can be mapped back to its parent.
    """

    def __init__(
        self,
        parent_chunk_size: int = 1000,
        child_chunk_size: int = 200,
        parent_chunk_overlap: int = 100,
        child_chunk_overlap: int = 20,
    ):
        self.parent_splitter = RecursiveCharacterTextSplitter(
            chunk_size=parent_chunk_size,
            chunk_overlap=parent_chunk_overlap,
        )
        self.child_splitter = RecursiveCharacterTextSplitter(
            chunk_size=child_chunk_size,
            chunk_overlap=child_chunk_overlap,
        )

    def split_documents(self, documents: List[Document]) -> tuple[List[Document], List[Document]]:
        """
        Split *documents* into linked parent and child chunks.

        BUG FIX: the original split children directly from the raw
        documents and never linked them to a parent, which defeats the
        parent/child retrieval pattern. Children are now derived from each
        parent chunk's text (guaranteeing containment) and tagged with the
        parent's ID.

        Returns:
            (parent_chunks, child_chunks)
        """
        parent_chunks = self.parent_splitter.split_documents(documents)
        child_chunks: List[Document] = []
        for parent in parent_chunks:
            # Stable ID linking this parent to all of its children.
            parent_id = str(uuid.uuid4())
            parent.metadata["chunk_id"] = parent_id
            # Split the parent chunk itself so each child is a substring of
            # its parent and inherits the parent's metadata.
            for child in self.child_splitter.split_documents([parent]):
                child.metadata["parent_id"] = parent_id
                child_chunks.append(child)
        return parent_chunks, child_chunks