This commit is contained in:
@@ -23,7 +23,7 @@ Offline RAG Indexer module.
|
||||
>>> await builder.build_from_file("document.pdf")
|
||||
"""
|
||||
|
||||
from .IndexBuilder import IndexBuilder, IndexBuilderConfig, DocstoreConfig
|
||||
from .index_builder import IndexBuilder, IndexBuilderConfig, DocstoreConfig
|
||||
from .loaders import DocumentLoader
|
||||
from .splitters import SplitterType, get_splitter
|
||||
|
||||
@@ -39,7 +39,7 @@ __version__ = "2.0.0"
|
||||
|
||||
__all__ = [
|
||||
# 核心构建器与配置
|
||||
"IndexBuilder",
|
||||
"index_builder",
|
||||
"IndexBuilderConfig",
|
||||
"DocstoreConfig",
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from rag_indexer.IndexBuilder import IndexBuilder, IndexBuilderConfig
|
||||
from rag_indexer.index_builder import IndexBuilder, IndexBuilderConfig
|
||||
from rag_indexer.splitters import SplitterType
|
||||
|
||||
logging.basicConfig(
|
||||
|
||||
@@ -19,7 +19,8 @@ from langchain_classic.retrievers import ParentDocumentRetriever
|
||||
|
||||
from .loaders import DocumentLoader
|
||||
from .splitters import SplitterType, get_splitter, SemanticChunkerAdapter
|
||||
from rag_core import LlamaCppEmbedder, QdrantVectorStore, create_docstore
|
||||
from rag_core import LlamaCppEmbedder, QdrantVectorStore, create_docstore, create_parent_retriever
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -113,43 +114,40 @@ class IndexBuilder:
|
||||
logger.info("使用单一 %s 切分器", self.config.splitter_type.value)
|
||||
|
||||
def _init_parent_child_mode(self) -> None:
|
||||
"""父子块切分模式,初始化父块/子块切分器、文档存储和检索器。"""
|
||||
cfg = self.config
|
||||
|
||||
# 父块切分器(始终使用递归切分)
|
||||
# 父块切分器(索引构建需要,必须保留)
|
||||
self.parent_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=cfg.parent_chunk_size,
|
||||
chunk_overlap=cfg.parent_chunk_overlap,
|
||||
)
|
||||
|
||||
# 子块切分器
|
||||
# 子块切分器(索引构建需要)
|
||||
if cfg.child_splitter_type == SplitterType.SEMANTIC:
|
||||
self.child_splitter = get_splitter(
|
||||
SplitterType.SEMANTIC,
|
||||
embeddings=self.embeddings,
|
||||
**cfg.extra_splitter_kwargs
|
||||
)
|
||||
logger.info("子块使用语义切分器")
|
||||
else:
|
||||
self.child_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=cfg.child_chunk_size,
|
||||
chunk_overlap=cfg.child_chunk_overlap,
|
||||
)
|
||||
logger.info("子块使用递归切分器,块大小=%d,重叠=%d",
|
||||
cfg.child_chunk_size, cfg.child_chunk_overlap)
|
||||
|
||||
# 初始化文档存储(用于父块)
|
||||
# 文档存储
|
||||
self.docstore = self._create_or_use_docstore()
|
||||
|
||||
# 创建检索器
|
||||
self.retriever = ParentDocumentRetriever(
|
||||
vectorstore=self.vector_store.get_langchain_vectorstore(),
|
||||
docstore=self.docstore,
|
||||
child_splitter=self.child_splitter, # type: ignore[arg-type]
|
||||
# 使用工厂函数创建检索器,避免重复代码
|
||||
self.retriever = create_parent_retriever(
|
||||
collection_name=cfg.collection_name,
|
||||
embeddings=self.embeddings,
|
||||
parent_splitter=self.parent_splitter,
|
||||
search_kwargs={"k": cfg.search_k},
|
||||
child_splitter=self.child_splitter,
|
||||
docstore=self.docstore,
|
||||
search_k=cfg.search_k,
|
||||
)
|
||||
logger.info("ParentDocumentRetriever 初始化完成,父块大小=%d", cfg.parent_chunk_size)
|
||||
logger.info("ParentDocumentRetriever 初始化完成")
|
||||
|
||||
def _create_or_use_docstore(self) -> BaseStore:
|
||||
"""创建或获取文档存储实例。"""
|
||||
@@ -10,7 +10,7 @@ import sys
|
||||
# 添加项目根目录到 Python 路径
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from rag_indexer.IndexBuilder import IndexBuilder
|
||||
from rag_indexer.index_builder import IndexBuilder
|
||||
from rag_indexer.splitters import SplitterType
|
||||
|
||||
async def test_index_builder():
|
||||
|
||||
@@ -129,7 +129,7 @@ async def check_postgres():
|
||||
|
||||
async def test_search():
|
||||
"""测试检索功能。"""
|
||||
from rag_indexer.IndexBuilder import IndexBuilder, IndexBuilderConfig
|
||||
from rag_indexer.index_builder import IndexBuilder, IndexBuilderConfig
|
||||
from rag_indexer.splitters import SplitterType
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
|
||||
Reference in New Issue
Block a user