修改配置

This commit is contained in:
2026-04-21 18:41:14 +08:00
parent 08826c70a3
commit e2eaac9498
12 changed files with 393 additions and 148 deletions

View File

@@ -26,8 +26,22 @@ Offline RAG Indexer module.
from .index_builder import IndexBuilder, IndexBuilderConfig, DocstoreConfig
from .loaders import DocumentLoader
from .splitters import SplitterType, get_splitter
from .config import (
QDRANT_URL,
QDRANT_API_KEY,
LLAMACPP_EMBEDDING_URL,
LLAMACPP_API_KEY,
DB_URI,
DOCSTORE_URI,
RAG_OCR_LANGUAGES,
RAG_DOC_LANGUAGES,
)
# 从 rag_core 重新导出常用组件
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
from rag_core import (
LlamaCppEmbedder,
QdrantVectorStore,
@@ -39,7 +53,7 @@ __version__ = "2.0.0"
__all__ = [
# 核心构建器与配置
"index_builder",
"IndexBuilder",
"IndexBuilderConfig",
"DocstoreConfig",
@@ -50,6 +64,16 @@ __all__ = [
"SplitterType",
"get_splitter",
# 配置
"QDRANT_URL",
"QDRANT_API_KEY",
"LLAMACPP_EMBEDDING_URL",
"LLAMACPP_API_KEY",
"DB_URI",
"DOCSTORE_URI",
"RAG_OCR_LANGUAGES",
"RAG_DOC_LANGUAGES",
# 嵌入与向量存储
"LlamaCppEmbedder",
"QdrantVectorStore",

View File

@@ -6,13 +6,24 @@ import asyncio
import logging
import sys
from pathlib import Path
from dotenv import load_dotenv
# 加载 .env 文件
load_dotenv()
# 添加项目根目录和 backend 目录到 Python 路径
sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
from .index_builder import IndexBuilder, IndexBuilderConfig
from .splitters import SplitterType
# 导入方式:条件导入,支持作为脚本运行和作为包导入
if __name__ == "__main__":
# 作为脚本直接运行时使用绝对导入
from rag_indexer.index_builder import IndexBuilder, IndexBuilderConfig
from rag_indexer.splitters import SplitterType
else:
# 作为包导入时使用相对导入
from .index_builder import IndexBuilder, IndexBuilderConfig
from .splitters import SplitterType
logging.basicConfig(
level=logging.INFO,

View File

@@ -1,32 +1,71 @@
"""
RAG Indexer 配置管理模块
集中管理所有环境变量配置项,避免散落在各个文件中
所有配置直接从环境变量读取,无默认值,避免配置混乱
需要类型转换的配置在此处理
"""
import os
# Prefer the shared settings exported by rag_core.config when that package is importable.
# NOTE(review): the same six names are assigned again further down in this file via _get_str,
# which would override whatever is bound here — confirm this block is still needed.
try:
from rag_core.config import (
QDRANT_URL,
QDRANT_API_KEY,
LLAMACPP_EMBEDDING_URL,
LLAMACPP_API_KEY,
DB_URI,
DOCSTORE_URI,
)
except ImportError:
# rag_core is not importable: read the environment variables directly, with hard-coded defaults.
QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "")
LLAMACPP_EMBEDDING_URL = os.getenv("LLAMACPP_EMBEDDING_URL", "http://127.0.0.1:8082")
LLAMACPP_API_KEY = os.getenv("LLAMACPP_API_KEY", "")
# NOTE(review): the default connection string below embeds a plaintext password —
# consider removing the credential from source control.
DB_URI = os.getenv(
"DB_URI",
"postgresql://postgres:huang1998@ai-postgres:5432/langgraph_db?sslmode=disable"
)
# The document store falls back to DB_URI when DOCSTORE_URI is not set.
DOCSTORE_URI = os.getenv("DOCSTORE_URI", DB_URI)
# ========== 辅助函数:类型转换 ==========
def _get_str(key: str) -> str | None:
"""获取字符串配置"""
return os.getenv(key)
def _get_int(key: str) -> int | None:
"""获取整数配置,自动转换"""
value = os.getenv(key)
if value is not None:
try:
return int(value)
except (ValueError, TypeError):
pass
return None
def _get_list_str(key: str, default: list[str] | None = None) -> list[str]:
"""获取字符串列表配置,从逗号分隔的字符串解析"""
value = os.getenv(key)
if value is not None:
return [item.strip() for item in value.split(",") if item.strip()]
return default or []
# ========== Vector database settings (URL + API key pair) ==========
# Both values are read as-is from the environment and are None when unset.
QDRANT_URL = _get_str("QDRANT_URL")
QDRANT_API_KEY = _get_str("QDRANT_API_KEY")
# ========== Embedding service settings (URL + API key pair) ==========
LLAMACPP_EMBEDDING_URL = _get_str("LLAMACPP_EMBEDDING_URL")
LLAMACPP_API_KEY = _get_str("LLAMACPP_API_KEY")
# ========== Document store settings (separate fields + full URI) ==========
# Separate connection fields. The original comment marks these as "preferred";
# NOTE(review): nothing in this block combines them into DB_URI — confirm where they are consumed.
DB_HOST = _get_str("DB_HOST")
# DB_PORT is converted to int; it is None when unset or not a valid integer.
DB_PORT = _get_int("DB_PORT")
DB_USER = _get_str("DB_USER")
DB_PASSWORD = _get_str("DB_PASSWORD")
DB_NAME = _get_str("DB_NAME")
# Full connection string, read directly from the environment.
DB_URI = _get_str("DB_URI")
# Document store URI, read directly from the environment; falls back to DB_URI
# when DOCSTORE_URI is unset or an empty string.
DOCSTORE_URI = _get_str("DOCSTORE_URI") or DB_URI
# ========== Document loader settings (unstructured library) ==========
# OCR language list (comma-separated, e.g. "chi_sim,eng").
RAG_OCR_LANGUAGES = _get_list_str("RAG_OCR_LANGUAGES", ["chi_sim", "eng"])
# Primary document language list (comma-separated, e.g. "zh").
RAG_DOC_LANGUAGES = _get_list_str("RAG_DOC_LANGUAGES", ["zh"])
# ========== Indexer-specific settings ==========
# Index storage path.
# NOTE(review): INDEX_STORAGE_PATH is assigned twice in a row; the second assignment wins,
# so the "./index_storage" default never takes effect — confirm the first line is a leftover.
INDEX_STORAGE_PATH = os.getenv("INDEX_STORAGE_PATH", "./index_storage")
INDEX_STORAGE_PATH = _get_str("INDEX_STORAGE_PATH")

View File

@@ -23,6 +23,12 @@ from qdrant_client.http.exceptions import ResponseHandlingException
from .loaders import DocumentLoader
from .splitters import SplitterType, get_splitter
# 从 rag_core 导入
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
from rag_core import LlamaCppEmbedder, QdrantVectorStore, create_docstore, create_parent_retriever
logger = logging.getLogger(__name__)

View File

@@ -11,6 +11,9 @@ from langchain_core.documents import Document
from unstructured.documents.elements import Element
from unstructured.partition.auto import partition
# 相对导入配置
from .config import RAG_OCR_LANGUAGES, RAG_DOC_LANGUAGES
logger = logging.getLogger(__name__)
# 模块加载时设置一次环境变量,避免重复设置
@@ -47,8 +50,8 @@ class DocumentLoader:
"""
self.extract_images = extract_images
self.strategy = strategy
self.ocr_languages = ocr_languages or ["chi_sim", "eng"]
self.languages = languages or ["zh"]
self.ocr_languages = ocr_languages or RAG_OCR_LANGUAGES
self.languages = languages or RAG_DOC_LANGUAGES
self.include_page_breaks = include_page_breaks
self.pdf_infer_table_structure = pdf_infer_table_structure
self.partition_kwargs = partition_kwargs or {}