2026-04-18 16:56:23 +08:00
|
|
|
|
"""
|
|
|
|
|
|
Offline RAG Indexer module.
|
2026-04-19 15:01:40 +08:00
|
|
|
|
|
|
|
|
|
|
提供完整的离线索引构建功能,包括:
|
|
|
|
|
|
- 文档加载(PDF、Word、TXT 等)
|
|
|
|
|
|
- 文本切分(递归、语义、父子块)
|
|
|
|
|
|
- 向量嵌入(支持 llama.cpp)
|
|
|
|
|
|
- 向量存储(Qdrant)
|
|
|
|
|
|
- 父文档存储(PostgreSQL)
|
|
|
|
|
|
|
|
|
|
|
|
示例用法:
|
2026-04-19 22:01:55 +08:00
|
|
|
|
>>> from rag_indexer import IndexBuilder, IndexBuilderConfig, SplitterType
|
2026-04-19 15:01:40 +08:00
|
|
|
|
>>>
|
2026-04-19 22:01:55 +08:00
|
|
|
|
>>> config = IndexBuilderConfig(
|
2026-04-19 15:01:40 +08:00
|
|
|
|
... collection_name="my_docs",
|
|
|
|
|
|
... splitter_type=SplitterType.PARENT_CHILD,
|
|
|
|
|
|
... )
|
2026-04-19 22:01:55 +08:00
|
|
|
|
>>> builder = IndexBuilder(config)
|
2026-04-19 15:01:40 +08:00
|
|
|
|
>>>
|
2026-04-19 22:01:55 +08:00
|
|
|
|
>>> # 或直接传参(向后兼容)
|
|
|
|
|
|
>>> builder = IndexBuilder(collection_name="my_docs")
|
|
|
|
|
|
>>>
|
|
|
|
|
|
>>> await builder.build_from_file("document.pdf")
|
2026-04-18 16:56:23 +08:00
|
|
|
|
"""
|
|
|
|
|
|
|
2026-04-21 10:26:37 +08:00
|
|
|
|
from .index_builder import IndexBuilder, IndexBuilderConfig, DocstoreConfig
|
|
|
|
|
|
from .loaders import DocumentLoader
|
|
|
|
|
|
from .splitters import SplitterType, get_splitter
|
2026-04-18 16:56:23 +08:00
|
|
|
|
|
2026-04-19 22:01:55 +08:00
|
|
|
|
# 从 rag_core 重新导出常用组件
|
|
|
|
|
|
from rag_core import (
|
|
|
|
|
|
LlamaCppEmbedder,
|
|
|
|
|
|
QdrantVectorStore,
|
2026-04-19 15:01:40 +08:00
|
|
|
|
PostgresDocStore,
|
|
|
|
|
|
create_docstore,
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
# Version string of this package.
__version__ = "2.0.0"
|
|
|
|
|
|
|
2026-04-18 16:56:23 +08:00
|
|
|
|
# Public API of the package: the names exported by ``from <package> import *``.
# Every entry must match a name bound by the imports at the top of this module.
# The first entry was previously "index_builder", which names the submodule
# rather than the ``IndexBuilder`` class, so the class was missing from star
# imports while the submodule was exported in its place.
__all__ = [
    # Core builder and configuration
    "IndexBuilder",
    "IndexBuilderConfig",
    "DocstoreConfig",
    # Loaders
    "DocumentLoader",
    # Splitting
    "SplitterType",
    "get_splitter",
    # Embedding and vector storage
    "LlamaCppEmbedder",
    "QdrantVectorStore",
    # Document storage
    "PostgresDocStore",
    "create_docstore",
]
|