Files
ailine/rag_indexer/__init__.py

84 lines
1.9 KiB
Python
Raw Normal View History

2026-04-18 16:56:23 +08:00
"""
Offline RAG Indexer module.
2026-04-19 15:01:40 +08:00
提供完整的离线索引构建功能包括
- 文档加载PDFWordTXT
- 文本切分递归语义父子块
- 向量嵌入支持 llama.cpp
- 向量存储Qdrant
- 父文档存储PostgreSQL
示例用法
2026-04-19 22:01:55 +08:00
>>> from rag_indexer import IndexBuilder, IndexBuilderConfig, SplitterType
2026-04-19 15:01:40 +08:00
>>>
2026-04-19 22:01:55 +08:00
>>> config = IndexBuilderConfig(
2026-04-19 15:01:40 +08:00
... collection_name="my_docs",
... splitter_type=SplitterType.PARENT_CHILD,
... )
2026-04-19 22:01:55 +08:00
>>> builder = IndexBuilder(config)
2026-04-19 15:01:40 +08:00
>>>
2026-04-19 22:01:55 +08:00
>>> # 或直接传参(向后兼容)
>>> builder = IndexBuilder(collection_name="my_docs")
>>>
>>> await builder.build_from_file("document.pdf")
2026-04-18 16:56:23 +08:00
"""
2026-04-21 10:26:37 +08:00
from .index_builder import IndexBuilder, IndexBuilderConfig, DocstoreConfig
from .loaders import DocumentLoader
from .splitters import SplitterType, get_splitter
2026-04-21 18:41:14 +08:00
from .config import (
QDRANT_URL,
QDRANT_API_KEY,
LLAMACPP_EMBEDDING_URL,
LLAMACPP_API_KEY,
DB_URI,
DOCSTORE_URI,
RAG_OCR_LANGUAGES,
RAG_DOC_LANGUAGES,
)
2026-04-18 16:56:23 +08:00
2026-04-19 22:01:55 +08:00
# Re-export commonly used components from rag_core.
import sys
from pathlib import Path

# Directory that contains both this package and the ``backend`` directory.
_PACKAGE_PARENT = Path(__file__).parent.parent

# Put the ``backend`` directory itself at the front of the module search path
# (unchanged behavior); this makes modules inside it importable by bare name.
sys.path.insert(0, str(_PACKAGE_PARENT / "backend"))

# The import below is package-qualified (``backend.rag_core``), which is
# resolved from the directory *containing* ``backend``, not from ``backend``
# itself. Append that directory as a fallback so the import does not depend
# on the caller's working directory. It is appended rather than inserted at
# the front so it cannot shadow modules that already resolve.
if str(_PACKAGE_PARENT) not in sys.path:
    sys.path.append(str(_PACKAGE_PARENT))

from backend.rag_core import (
    LlamaCppEmbedder,
    QdrantVectorStore,
    PostgresDocStore,
    create_docstore,
)
__version__ = "2.0.0"

# Public API of the package: the names exported by a star-import.
__all__ = [
    # Core builder and configuration
    "IndexBuilder",
    "IndexBuilderConfig",
    "DocstoreConfig",
    # Loaders
    "DocumentLoader",
    # Splitting
    "SplitterType",
    "get_splitter",
    # Configuration
    "QDRANT_URL",
    "QDRANT_API_KEY",
    "LLAMACPP_EMBEDDING_URL",
    "LLAMACPP_API_KEY",
    "DB_URI",
    "DOCSTORE_URI",
    "RAG_OCR_LANGUAGES",
    "RAG_DOC_LANGUAGES",
    # Embedding and vector storage
    "LlamaCppEmbedder",
    "QdrantVectorStore",
    # Document storage
    "PostgresDocStore",
    "create_docstore",
]