导入方式修改
Some checks failed
构建并部署 AI Agent 服务 / deploy (push) Failing after 6m44s

This commit is contained in:
2026-05-05 23:17:00 +08:00
parent b5c15ef445
commit 3ae9daa01a
51 changed files with 445 additions and 532 deletions

View File

@@ -35,6 +35,7 @@ def get_input_path() -> Path:
return Path(sys.argv[1])
# 默认测试路径(可按需修改)
return Path("data/corpus/三国演义.txt")
#return Path("data/user_docs/doublestory.txt")
async def main():

View File

@@ -1,34 +0,0 @@
#!/usr/bin/env python3
"""
删除 Qdrant 集合并重新索引
"""
import asyncio
import os
import sys
from backend.rag_core import QdrantHybridStore
async def delete_and_recreate():
    """Drop the ``rag_documents`` collection and create it again.

    An error raised while deleting is printed and otherwise ignored, so the
    create step still runs afterwards.
    """
    banner = "=" * 70
    print(banner)
    print("删除旧集合并重新创建...")
    print(banner)

    store = QdrantHybridStore(collection_name="rag_documents")

    # Remove the existing collection; a failure here is only reported.
    try:
        store.delete_collection()
        print("✅ 旧集合已删除")
    except Exception as exc:
        print(f"⚠️ 删除集合时出错(可能不存在): {exc}")

    # Create the collection again.
    store.create_collection()
    print("✅ 新集合已创建")


# Script entry point: drive the coroutine on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(delete_and_recreate())

View File

@@ -17,10 +17,8 @@ class SplitterType(str, Enum):
PARENT_CHILD = "parent_child"
# ---------- 配置数据类,统一参数 ----------
@dataclass
class RecursiveSplitterConfig:
    """Settings for the recursive character splitter.

    Attributes are plain defaults; ``separators`` is an ordered list that is
    tried from left to right, with the empty string kept as the very last
    entry.
    """
    chunk_size: int = 500
    chunk_overlap: int = 50
    # An empty string anywhere before the end would make every later
    # separator (here the plain space) unreachable, because a recursive
    # splitter falls back to per-character splitting as soon as it meets "".
    # NOTE(review): the three entries before " " were empty strings in the
    # reviewed text, which looks like full-width punctuation lost in
    # transit. Restored as full stop / exclamation mark / question mark
    # (U+3002, U+FF01, U+FF1F) -- confirm the intended set.
    separators: List[str] = field(
        default_factory=lambda: [
            "\n\n",
            "\n",
            "\u3002",
            "\uff01",
            "\uff1f",
            " ",
            "",
        ]
    )
@@ -30,33 +28,31 @@ class RecursiveSplitterConfig:
@dataclass
class SemanticSplitterConfig:
    """Settings for the semantic splitter.

    Holds only the options that SemanticChunker supports. Each field is
    declared exactly once; the previously duplicated declarations of
    ``breakpoint_threshold_amount`` and ``sentence_split_regex`` were
    shadowed by the later ones and have been removed.
    """
    embeddings: Any
    buffer_size: int = 1
    add_start_index: bool = False
    breakpoint_threshold_type: str = "percentile"
    # Deliberately not None; per the original note this makes splitting more
    # aggressive.
    # NOTE(review): with "percentile", LangChain's SemanticChunker appears to
    # read this value on a 0-100 scale (its own default is 95) -- confirm
    # that 0.6 rather than 60 is intended.
    breakpoint_threshold_amount: float = 0.6
    number_of_chunks: Optional[int] = None
    # Zero-width split point right after a sentence terminator. The class
    # covers the full-width full stop, exclamation mark, question mark and
    # semicolon (U+3002, U+FF01, U+FF1F, U+FF1B) plus their ASCII forms.
    # The \uXXXX escapes are resolved by the ``re`` module, which keeps the
    # pattern intact even if the file passes through tools that normalise
    # full-width punctuation (the reviewed text had "!?;" twice instead).
    sentence_split_regex: str = r"(?<=[\u3002\uff01\uff1f\uff1b.!?;])"
    min_chunk_size: int = 100
@dataclass
class ParentChildSplitterConfig:
    """Settings for parent/child splitting.

    ``embeddings`` is the only required value and is declared once (the
    second, identical declaration was redundant). Field order is unchanged,
    so positional and keyword construction behave as before.
    """
    embeddings: Any  # handed to the semantic splitter

    # NOTE(review): the next seven fields look like an earlier layout
    # (recursive parents, semantic children). They are kept so existing
    # constructor calls keep working -- confirm whether anything still
    # reads them before removing.
    parent_chunk_size: int = 1000
    parent_chunk_overlap: int = 100
    child_buffer_size: int = 1
    child_breakpoint_threshold_type: str = "percentile"
    child_breakpoint_threshold_amount: Optional[float] = None
    child_min_chunk_size: int = 100
    child_max_chunk_size: Optional[int] = 200

    # Semantic splitting (used for the parent blocks)
    semantic_threshold_type: str = "percentile"
    semantic_threshold_amount: float = 0.6
    semantic_buffer_size: int = 1
    semantic_min_chunk_size: int = 100

    # Child blocks (recursive character splitting)
    child_chunk_size: int = 400
    child_chunk_overlap: int = 50
# ---------- 适配器:让 SemanticChunker 实现 TextSplitter 接口 ----------
# ---------- 适配器 ----------
class SemanticChunkerAdapter(TextSplitter):
"""将 SemanticChunker 适配为 LangChain TextSplitter 接口。"""
def __init__(self, config: SemanticSplitterConfig, **kwargs):
super().__init__(**kwargs)
self._config = config
@@ -86,12 +82,8 @@ class SemanticChunkerAdapter(TextSplitter):
return result
# ---------- 工厂函数,统一创建切分器 ----------
# ---------- 工厂函数 ----------
def get_splitter(splitter_type: SplitterType, **kwargs) -> TextSplitter:
"""
根据类型创建切分器。
支持传入配置对象或直接参数。
"""
if splitter_type == SplitterType.RECURSIVE:
config = RecursiveSplitterConfig(
chunk_size=kwargs.get("chunk_size", 500),
@@ -114,98 +106,90 @@ def get_splitter(splitter_type: SplitterType, **kwargs) -> TextSplitter:
if "config" in kwargs and isinstance(kwargs["config"], SemanticSplitterConfig):
config = kwargs["config"]
else:
# 过滤出 SemanticSplitterConfig 支持的字段
config_kwargs = {
"embeddings": embeddings,
"buffer_size": kwargs.get("buffer_size", 1),
"breakpoint_threshold_type": kwargs.get("breakpoint_threshold_type", "percentile"),
"breakpoint_threshold_amount": kwargs.get("breakpoint_threshold_amount"),
"number_of_chunks": kwargs.get("number_of_chunks"),
"min_chunk_size": kwargs.get("min_chunk_size", 100),
}
config = SemanticSplitterConfig(**config_kwargs)
config = SemanticSplitterConfig(
embeddings=embeddings,
buffer_size=kwargs.get("buffer_size", 1),
breakpoint_threshold_type=kwargs.get("breakpoint_threshold_type", "percentile"),
breakpoint_threshold_amount=kwargs.get("breakpoint_threshold_amount", 0.6),
number_of_chunks=kwargs.get("number_of_chunks"),
min_chunk_size=kwargs.get("min_chunk_size", 100),
)
return SemanticChunkerAdapter(config)
elif splitter_type == SplitterType.PARENT_CHILD:
# 父子切分器在 builder 中单独处理,不通过本函数创建
raise ValueError("父子切分器应通过 IndexBuilder 创建,不支持 get_splitter 直接构建")
raise ValueError("父子切分器应通过 ParentChildSplitter 直接创建")
else:
raise ValueError(f"不支持的切分器类型: {splitter_type}")
# ---------- 父子切分器实现 ----------
# ---------- 父子切分器 ----------
class ParentChildSplitter:
    """Split documents into parent blocks and the child blocks cut from them.

    Pipeline:
        1. semantic splitting of each document         -> parent blocks
        2. recursive character splitting of a parent   -> child blocks

    Every child carries its parent's id in ``metadata["parent_id"]``, and the
    instance records both directions of the relation in
    ``parent_to_children`` and ``child_to_parent``.
    """

    def __init__(self, config: ParentChildSplitterConfig):
        """Build the semantic and recursive splitters from ``config``."""
        self.config = config

        # Semantic splitter: produces the parent blocks.
        semantic_config = SemanticSplitterConfig(
            embeddings=config.embeddings,
            buffer_size=config.semantic_buffer_size,
            breakpoint_threshold_type=config.semantic_threshold_type,
            breakpoint_threshold_amount=config.semantic_threshold_amount,
            min_chunk_size=config.semantic_min_chunk_size,
        )
        self.semantic_splitter = SemanticChunkerAdapter(semantic_config)

        # Recursive character splitter: cuts every parent into children whose
        # size is governed by ``child_chunk_size``.
        # The empty string has to stay last: once the splitter reaches "" it
        # splits between every character and never tries later separators.
        # NOTE(review): the five entries before " " were empty strings in the
        # reviewed text, which looks like full-width punctuation lost in
        # transit. Restored as full stop, exclamation mark, question mark,
        # semicolon and comma (U+3002, U+FF01, U+FF1F, U+FF1B, U+FF0C) --
        # confirm the intended set.
        self.recursive_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.child_chunk_size,
            chunk_overlap=config.child_chunk_overlap,
            separators=[
                "\n\n",
                "\n",
                "\u3002",
                "\uff01",
                "\uff1f",
                "\uff1b",
                "\uff0c",
                " ",
                "",
            ],
        )

        # Parent/child relation, filled by split_documents().
        self.parent_to_children: Dict[str, List[str]] = {}
        self.child_to_parent: Dict[str, str] = {}

    def split_documents(self, documents: List[Document]) -> Tuple[List[Document], List[Document]]:
        """Split ``documents`` and return ``(parent_chunks, child_chunks)``.

        Parent ids are ``parent_<n>`` and child ids are ``child_<n>``, where
        ``n`` counts across all documents of this call. Each chunk copies the
        source document's metadata and adds ``id`` plus ``chunk_index``
        (parents, index within the document) or ``parent_id`` and
        ``child_index`` (children, index within the parent).

        The internal mappings are rebuilt on every call.
        """
        # Ids restart from zero on each call, so entries left over from a
        # previous call would be merged under the same keys; start clean.
        self.parent_to_children.clear()
        self.child_to_parent.clear()

        parent_chunks: List[Document] = []
        child_chunks: List[Document] = []

        for doc in documents:
            # Step 1: semantic splitting -> parent blocks.
            parent_texts = self.semantic_splitter.split_text(doc.page_content)
            for p_idx, parent_text in enumerate(parent_texts):
                parent_id = f"parent_{len(parent_chunks)}"
                parent_chunks.append(
                    Document(
                        page_content=parent_text,
                        metadata={**doc.metadata, "id": parent_id, "chunk_index": p_idx},
                    )
                )

                # Step 2: recursive character splitting -> child blocks.
                child_texts = self.recursive_splitter.split_text(parent_text)
                for c_idx, child_text in enumerate(child_texts):
                    child_id = f"child_{len(child_chunks)}"
                    child_chunks.append(
                        Document(
                            page_content=child_text,
                            metadata={
                                **doc.metadata,
                                "id": child_id,
                                "parent_id": parent_id,
                                "child_index": c_idx,
                            },
                        )
                    )
                    self.child_to_parent[child_id] = parent_id
                    self.parent_to_children.setdefault(parent_id, []).append(child_id)

        return parent_chunks, child_chunks

    def get_parent_for_child(self, child_id: str) -> Optional[str]:
        """Return the parent id recorded for ``child_id``, or None if unknown."""
        return self.child_to_parent.get(child_id)

    def get_children_for_parent(self, parent_id: str) -> List[str]:
        """Return the child ids recorded for ``parent_id`` (empty list if none)."""
        return self.parent_to_children.get(parent_id, [])