2026-04-18 16:56:23 +08:00
|
|
|
|
"""
|
2026-04-19 15:01:40 +08:00
|
|
|
|
文本切分器,用于将文档切分成块。
|
2026-04-18 16:56:23 +08:00
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
from enum import Enum
|
|
|
|
|
|
from typing import List, Optional
|
|
|
|
|
|
|
|
|
|
|
|
from langchain_core.documents import Document
|
2026-04-19 15:01:40 +08:00
|
|
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
|
2026-04-18 16:56:23 +08:00
|
|
|
|
from langchain_experimental.text_splitter import SemanticChunker
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SplitterType(str, Enum):
    """Names of the text-splitting strategies referred to by this module.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (for example ``SplitterType.RECURSIVE == "recursive"``).
    """

    # Recursive, character-based splitting.
    RECURSIVE = "recursive"

    # Embedding-driven semantic splitting.
    SEMANTIC = "semantic"

    # Two-level parent/child splitting.
    PARENT_CHILD = "parent_child"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_splitter(splitter_type: SplitterType, **kwargs):
    """Factory function that builds a text splitter for ``splitter_type``.

    Args:
        splitter_type: Which splitting strategy to create.
        **kwargs: Strategy-specific options.
            For ``RECURSIVE``: ``chunk_size`` (default 500) and
            ``chunk_overlap`` (default 50) are read; other keys are ignored.
            For ``SEMANTIC``: ``embeddings`` is required; all remaining keys
            are forwarded to ``SemanticChunkerAdapter``.

    Returns:
        A ``RecursiveCharacterTextSplitter`` or a ``SemanticChunkerAdapter``.

    Raises:
        ValueError: If ``embeddings`` is missing (or ``None``) for the
            semantic splitter, or if ``splitter_type`` is not one of the two
            handled values (this includes ``SplitterType.PARENT_CHILD``).
    """
    if splitter_type == SplitterType.RECURSIVE:
        return RecursiveCharacterTextSplitter(
            chunk_size=kwargs.get("chunk_size", 500),
            chunk_overlap=kwargs.get("chunk_overlap", 50),
            separators=["\n\n", "\n", "。", "!", "?", " ", ""],
        )

    if splitter_type == SplitterType.SEMANTIC:
        # Take the embedding model out of kwargs so it is not passed twice.
        embedding_model = kwargs.pop("embeddings", None)
        if embedding_model is not None:
            return SemanticChunkerAdapter(embeddings=embedding_model, **kwargs)
        raise ValueError("语义切分器需要提供 'embeddings' 参数")

    raise ValueError(f"不支持的切分器类型: {splitter_type}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SemanticChunkerAdapter(TextSplitter):
    """Adapt ``SemanticChunker`` to the ``TextSplitter`` interface.

    ``split_text`` is delegated to an internal ``SemanticChunker``; the
    remaining ``TextSplitter`` behaviour comes from the base class.
    """

    # Options that only TextSplitter.__init__ accepts; they must never be
    # forwarded to SemanticChunker.
    _BASE_ONLY_KWARGS = (
        "chunk_size",
        "chunk_overlap",
        "length_function",
        "keep_separator",
        "strip_whitespace",
    )

    def __init__(self, embeddings, **kwargs):
        """Create the adapter.

        Args:
            embeddings: Embedding model handed to ``SemanticChunker``.
            **kwargs: Mixed options. Keys listed in ``_BASE_ONLY_KWARGS`` go
                to ``TextSplitter.__init__``; ``add_start_index`` goes to
                both constructors; everything else goes to
                ``SemanticChunker``.
        """
        # Partition the options BEFORE calling either constructor. Passing
        # SemanticChunker-specific keys to TextSplitter.__init__ raises
        # TypeError, because the base class does not accept arbitrary keywords.
        base_kwargs = {
            key: kwargs.pop(key)
            for key in self._BASE_ONLY_KWARGS
            if key in kwargs
        }
        # Both constructors accept add_start_index, so share it, not move it.
        if "add_start_index" in kwargs:
            base_kwargs["add_start_index"] = kwargs["add_start_index"]

        super().__init__(**base_kwargs)
        self._chunker = SemanticChunker(embeddings=embeddings, **kwargs)

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into chunks using the wrapped ``SemanticChunker``."""
        return self._chunker.split_text(text)
|
2026-04-18 16:56:23 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ParentChildSplitter:
    """Split documents into parent chunks (large) and child chunks (small).

    Child chunks are intended for indexing and retrieval; parent chunks are
    intended to be stored as the surrounding context. Each child carries the
    ``parent_id`` of the parent chunk it was cut from.
    """

    def __init__(
        self,
        parent_chunk_size: int = 1000,
        child_chunk_size: int = 200,
        parent_chunk_overlap: int = 100,
        child_chunk_overlap: int = 20,
    ):
        """Build the two underlying recursive splitters.

        Args:
            parent_chunk_size: ``chunk_size`` of the parent splitter.
            child_chunk_size: ``chunk_size`` of the child splitter.
            parent_chunk_overlap: ``chunk_overlap`` of the parent splitter.
            child_chunk_overlap: ``chunk_overlap`` of the child splitter.
        """
        self.parent_splitter = RecursiveCharacterTextSplitter(
            chunk_size=parent_chunk_size,
            chunk_overlap=parent_chunk_overlap,
        )
        self.child_splitter = RecursiveCharacterTextSplitter(
            chunk_size=child_chunk_size,
            chunk_overlap=child_chunk_overlap,
        )

    def split_documents(self, documents: List[Document]) -> tuple[List[Document], List[Document]]:
        """Split ``documents`` into linked parent and child chunks.

        Every parent chunk gets a fresh UUID stored under
        ``metadata["parent_id"]`` (an existing value under that key is
        overwritten). Child chunks are produced by splitting each parent
        chunk, and each child records the same ``parent_id`` in its metadata.

        Args:
            documents: Source documents to split.

        Returns:
            ``(parent_chunks, child_chunks)``.
        """
        import uuid  # local import keeps this block self-contained

        parent_chunks = self.parent_splitter.split_documents(documents)
        child_chunks: List[Document] = []

        for parent in parent_chunks:
            parent_id = str(uuid.uuid4())
            parent.metadata["parent_id"] = parent_id

            # Children are cut from the parent itself (not from the raw
            # document), so each child belongs to exactly one parent.
            children = self.child_splitter.split_documents([parent])
            for child in children:
                # Set explicitly, not relying on metadata propagation.
                child.metadata["parent_id"] = parent_id
            child_chunks.extend(children)

        return parent_chunks, child_chunks
|