重排,多路查询
All checks were successful
构建并部署 AI Agent 服务 / deploy (push) Successful in 35m37s

This commit is contained in:
2026-04-20 01:10:18 +08:00
parent 933d418d77
commit 3c906e91d9
21 changed files with 728 additions and 635 deletions

View File

@@ -23,7 +23,7 @@ Offline RAG Indexer module.
>>> await builder.build_from_file("document.pdf")
"""
from .IndexBuilder import IndexBuilder, IndexBuilderConfig, DocstoreConfig
from .index_builder import IndexBuilder, IndexBuilderConfig, DocstoreConfig
from .loaders import DocumentLoader
from .splitters import SplitterType, get_splitter
@@ -39,7 +39,7 @@ __version__ = "2.0.0"
__all__ = [
# 核心构建器与配置
"IndexBuilder",
"index_builder",
"IndexBuilderConfig",
"DocstoreConfig",

View File

@@ -7,7 +7,7 @@ import logging
import sys
from pathlib import Path
from rag_indexer.IndexBuilder import IndexBuilder, IndexBuilderConfig
from rag_indexer.index_builder import IndexBuilder, IndexBuilderConfig
from rag_indexer.splitters import SplitterType
logging.basicConfig(

View File

@@ -19,7 +19,8 @@ from langchain_classic.retrievers import ParentDocumentRetriever
from .loaders import DocumentLoader
from .splitters import SplitterType, get_splitter, SemanticChunkerAdapter
from rag_core import LlamaCppEmbedder, QdrantVectorStore, create_docstore
from rag_core import LlamaCppEmbedder, QdrantVectorStore, create_docstore, create_parent_retriever
logger = logging.getLogger(__name__)
@@ -113,43 +114,40 @@ class IndexBuilder:
logger.info("使用单一 %s 切分器", self.config.splitter_type.value)
def _init_parent_child_mode(self) -> None:
"""父子块切分模式,初始化父块/子块切分器、文档存储和检索器。"""
cfg = self.config
# 父块切分器(始终使用递归切分)
# 父块切分器(索引构建需要,必须保留)
self.parent_splitter = RecursiveCharacterTextSplitter(
chunk_size=cfg.parent_chunk_size,
chunk_overlap=cfg.parent_chunk_overlap,
)
# 子块切分器
# 子块切分器(索引构建需要)
if cfg.child_splitter_type == SplitterType.SEMANTIC:
self.child_splitter = get_splitter(
SplitterType.SEMANTIC,
embeddings=self.embeddings,
**cfg.extra_splitter_kwargs
)
logger.info("子块使用语义切分器")
else:
self.child_splitter = RecursiveCharacterTextSplitter(
chunk_size=cfg.child_chunk_size,
chunk_overlap=cfg.child_chunk_overlap,
)
logger.info("子块使用递归切分器,块大小=%d,重叠=%d",
cfg.child_chunk_size, cfg.child_chunk_overlap)
# 初始化文档存储(用于父块)
# 文档存储
self.docstore = self._create_or_use_docstore()
# 创建检索器
self.retriever = ParentDocumentRetriever(
vectorstore=self.vector_store.get_langchain_vectorstore(),
docstore=self.docstore,
child_splitter=self.child_splitter, # type: ignore[arg-type]
# 使用工厂函数创建检索器,避免重复代码
self.retriever = create_parent_retriever(
collection_name=cfg.collection_name,
embeddings=self.embeddings,
parent_splitter=self.parent_splitter,
search_kwargs={"k": cfg.search_k},
child_splitter=self.child_splitter,
docstore=self.docstore,
search_k=cfg.search_k,
)
logger.info("ParentDocumentRetriever 初始化完成,父块大小=%d", cfg.parent_chunk_size)
logger.info("ParentDocumentRetriever 初始化完成")
def _create_or_use_docstore(self) -> BaseStore:
"""创建或获取文档存储实例。"""

View File

@@ -10,7 +10,7 @@ import sys
# 添加项目根目录到 Python 路径
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from rag_indexer.IndexBuilder import IndexBuilder
from rag_indexer.index_builder import IndexBuilder
from rag_indexer.splitters import SplitterType
async def test_index_builder():

View File

@@ -129,7 +129,7 @@ async def check_postgres():
async def test_search():
"""测试检索功能。"""
from rag_indexer.IndexBuilder import IndexBuilder, IndexBuilderConfig
from rag_indexer.index_builder import IndexBuilder, IndexBuilderConfig
from rag_indexer.splitters import SplitterType
print("\n" + "=" * 60)