RAG数据库生成

This commit is contained in:
2026-04-19 15:01:40 +08:00
parent c18e8a9860
commit cc8ef41ef9
17 changed files with 1089 additions and 577 deletions

View File

@@ -1,12 +1,12 @@
"""
Text splitters for chunking documents.
文本切分器,用于将文档切分成块。
"""
from enum import Enum
from typing import List, Optional
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
from langchain_experimental.text_splitter import SemanticChunker
@@ -17,7 +17,7 @@ class SplitterType(str, Enum):
def get_splitter(splitter_type: SplitterType, **kwargs):
    """Factory function: build and return a text splitter.

    Args:
        splitter_type: Which splitter implementation to create.
        **kwargs: Splitter options. For ``RECURSIVE``: ``chunk_size``
            (default 500) and ``chunk_overlap`` (default 50). For
            ``SEMANTIC``: ``embeddings`` is required; all remaining kwargs
            are forwarded to :class:`SemanticChunkerAdapter`.

    Returns:
        A configured splitter instance.

    Raises:
        ValueError: If ``embeddings`` is missing for the semantic splitter,
            or if ``splitter_type`` is not a supported type.
    """
    if splitter_type == SplitterType.RECURSIVE:
        chunk_size = kwargs.get("chunk_size", 500)
        chunk_overlap = kwargs.get("chunk_overlap", 50)
        return RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # NOTE(review): several separators render as empty strings here —
            # presumably CJK punctuation (e.g. "。", ",", "、") lost to an
            # encoding/diff-rendering issue. Confirm against the original file;
            # duplicate "" entries are redundant for the splitter.
            separators=["\n\n", "\n", "", "", "", " ", ""],
        )
    elif splitter_type == SplitterType.SEMANTIC:
        # Semantic splitting needs an embeddings model to locate topic
        # boundaries; pop it so it is not forwarded twice via **kwargs.
        embeddings = kwargs.pop("embeddings", None)
        if embeddings is None:
            raise ValueError("语义切分器需要提供 'embeddings' 参数")
        return SemanticChunkerAdapter(embeddings=embeddings, **kwargs)
    else:
        raise ValueError(f"不支持的切分器类型: {splitter_type}")
class SemanticChunkerAdapter(TextSplitter):
    """Adapt ``SemanticChunker`` to the ``TextSplitter`` interface.

    ``SemanticChunker`` does not subclass ``TextSplitter``; this wrapper
    exposes it through the standard ``split_text`` contract so it can be
    used interchangeably with other splitters.
    """

    # Keyword arguments consumed by the TextSplitter base class rather than
    # by SemanticChunker itself.
    _BASE_KWARGS = ("chunk_size", "chunk_overlap")

    def __init__(self, embeddings, **kwargs):
        """Create the adapter.

        Args:
            embeddings: Embedding model used to locate semantic breakpoints.
            **kwargs: ``chunk_size``/``chunk_overlap`` are routed to the
                ``TextSplitter`` base class; everything else is forwarded
                to ``SemanticChunker``.
        """
        # Bug fix: the original called super().__init__(**kwargs) *before*
        # popping, so any SemanticChunker-only option (e.g.
        # breakpoint_threshold_type) reached TextSplitter.__init__ and
        # raised TypeError. Split the kwargs up front instead.
        base_kwargs = {
            key: kwargs.pop(key) for key in self._BASE_KWARGS if key in kwargs
        }
        super().__init__(**base_kwargs)
        self._chunker = SemanticChunker(embeddings=embeddings, **kwargs)

    def split_text(self, text: str) -> List[str]:
        """Split *text* into semantically coherent chunks."""
        return self._chunker.split_text(text)
class ParentChildSplitter:
"""
Splits documents into parent (large) and child (small) chunks.
Child chunks are indexed for retrieval, parent chunks are stored for context.
将文档切分为父块(大块)和子块(小块)。
子块用于索引检索,父块用于存储上下文。
"""
def __init__(
@@ -60,12 +72,12 @@ class ParentChildSplitter:
def split_documents(self, documents: List[Document]) -> tuple[List[Document], List[Document]]:
    """Split *documents* into coarse parent chunks and fine child chunks.

    Returns:
        A ``(parent_chunks, child_chunks)`` tuple — child chunks are
        intended for index/retrieval, parent chunks for stored context.
    """
    parents = self.parent_splitter.split_documents(documents)
    children = self.child_splitter.split_documents(documents)
    # TODO: attach each child to its parent chunk ID via metadata —
    # the lists are currently returned unlinked.
    return parents, children