From e2eaac949886cde3350a7daea6c26dfc11bb036c Mon Sep 17 00:00:00 2001 From: root <953994191@qq.com> Date: Tue, 21 Apr 2026 18:41:14 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env.docker | 73 +++++++++++------------ backend/app/config.py | 109 ++++++++++++++++++++++++----------- backend/rag_core/config.py | 56 +++++++++++++----- docker/backend/Dockerfile | 6 +- docker/docker-compose.yml | 19 +++--- frontend/src/config.py | 72 ++++++++++++++++------- rag_indexer/__init__.py | 26 ++++++++- rag_indexer/cli.py | 15 ++++- rag_indexer/config.py | 83 +++++++++++++++++++------- rag_indexer/index_builder.py | 6 ++ rag_indexer/loaders.py | 7 ++- test/test_frontend.py | 69 ++++++++++++++++++++++ 12 files changed, 393 insertions(+), 148 deletions(-) create mode 100644 test/test_frontend.py diff --git a/.env.docker b/.env.docker index d08f87b..65c2efe 100644 --- a/.env.docker +++ b/.env.docker @@ -10,7 +10,38 @@ ZHIPUAI_API_KEY=your_zhipuai_api_key_here DEEPSEEK_API_KEY=your_deepseek_api_key_here # llama.cpp 服务认证 Token(与容器启动参数一致) -LLAMACPP_API_KEY=token-abc123 +LLAMACPP_API_KEY=huang1998 + +# ----------------------------------------------------------------------------- +# PostgreSQL 数据库配置(分离配置,易于管理) +# ----------------------------------------------------------------------------- +DB_HOST=115.190.121.151 +DB_PORT=5432 +DB_USER=postgres +DB_PASSWORD=huang1998 +DB_NAME=langgraph_db +# 完整连接字符串(也支持直接配置,优先使用分离配置) +DB_URI=postgresql://postgres:huang1998@115.190.121.151:5432/langgraph_db?sslmode=disable + +# ----------------------------------------------------------------------------- +# Qdrant 向量数据库配置(URL + API密钥 配对) +# ----------------------------------------------------------------------------- +QDRANT_URL=http://115.190.121.151:6333 +QDRANT_API_KEY=huang1998 +QDRANT_COLLECTION_NAME=mem0_user_memories + +# ----------------------------------------------------------------------------- +# llama.cpp 服务配置(URL + API密钥 配对) +# ----------------------------------------------------------------------------- +# 主 LLM 服务 (Gemma-4-E2B GGUF) - 端口 8081 +VLLM_BASE_URL=http://host.docker.internal:18000/v1 + +# Embedding 服务 (embeddinggemma-300M GGUF) - 端口 8082 +LLAMACPP_EMBEDDING_URL=http://host.docker.internal:18001/v1 +# LLAMACPP_API_KEY=huang1998 (已在上面配置) + +# Reranker 服务 (bge-reranker-v2-m3) - 端口 8083 +LLAMACPP_RERANKER_URL=http://host.docker.internal:18002/v1 # ⭐ 日志调试配置(部署时可灵活调整) # ============================================================================= @@ -28,53 +59,17 @@ DEBUG=false # false: 关闭追踪,减少日志量 ENABLE_GRAPH_TRACE=false -# ----------------------------------------------------------------------------- -# llama.cpp 服务配置 -# ----------------------------------------------------------------------------- -# 主 LLM 服务 (Gemma-4-E2B GGUF) - 端口 8081 -VLLM_BASE_URL=http://host.docker.internal:18000/v1 - -# Embedding 服务 (embeddinggemma-300M GGUF) - 端口 8082 -LLAMACPP_EMBEDDING_URL=http://host.docker.internal:18001/v1 - -# Reranker 服务 (bge-reranker-v2-m3) - 端口 8083 -LLAMACPP_RERANKER_URL=http://host.docker.internal:18002/v1 - -# ----------------------------------------------------------------------------- -# Mem0 记忆层配置 -# ----------------------------------------------------------------------------- -# Qdrant 向量数据库(远程服务器上的独立容器) -QDRANT_URL=http://115.190.121.151:6333 -QDRANT_COLLECTION_NAME=mem0_user_memories - -# ----------------------------------------------------------------------------- -# 数据库配置 -# ----------------------------------------------------------------------------- -# PostgreSQL 连接字符串(远程服务器上的独立容器) -DB_URI=postgresql://postgres:huang1998@115.190.121.151:5432/langgraph_db?sslmode=disable - # ----------------------------------------------------------------------------- # 前端配置 # ----------------------------------------------------------------------------- # Docker Compose 内部网络,使用服务名 'backend' -API_URL=http://backend:8083/chat +API_URL=http://backend:8079/chat # ⭐ 前端通信地址(Docker 内部网络) # 注意:这里只需要域名和端口,不需要 /chat 路径 -- API_URL=http://backend:8083 +# API_URL=http://backend:8079 # ----------------------------------------------------------------------------- # 应用行为配置 # ----------------------------------------------------------------------------- MEMORY_SUMMARIZE_INTERVAL=10 - -# ----------------------------------------------------------------------------- -# unstructured 库 spaCy 模型配置 -# ----------------------------------------------------------------------------- -# 指定文档解析使用的语言: eng (英语) 或 zho (中文) -UNSTRUCTURED_LANGUAGE=zho - -# 指定 spaCy 模型名称(需与 UNSTRUCTURED_LANGUAGE 对应) -# eng -> en_core_web_sm -# zho -> zh_core_web_sm -SPACY_MODEL=zh_core_web_sm diff --git a/backend/app/config.py b/backend/app/config.py index 5b3ea11..b62f05e 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -1,50 +1,89 @@ """ 环境变量集中管理模块 所有配置项统一定义,避免散落在各个文件中 +配置分组:相关配置放在一起,URL 和 API Key 配对 +所有配置直接从环境变量读取,无默认值,避免配置混乱 +需要类型转换的配置在此处理 """ import os -# ========== Graph 执行追踪配置 ========== -# 是否启用 Graph 流转追踪(通过环境变量控制) -ENABLE_GRAPH_TRACE = os.getenv("ENABLE_GRAPH_TRACE", "true").lower() == "true" +# ========== 辅助函数:类型转换 ========== +def _get_str(key: str) -> str | None: + """获取字符串配置""" + return os.getenv(key) -# ========== 记忆提取配置 ========== -# 记忆提取间隔:每 N 轮对话生成一次摘要 -MEMORY_SUMMARIZE_INTERVAL = int(os.getenv("MEMORY_SUMMARIZE_INTERVAL", "10")) -# ========== Mem0 记忆层配置 ========== -# Qdrant 向量数据库地址 -QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333") -QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "mem0_user_memories") -QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "your-qdrant-api-key") +def _get_int(key: str) -> int | None: + """获取整数配置,自动转换""" + value = os.getenv(key) + if value is not None: + try: + return int(value) + except (ValueError, TypeError): + pass + return None -# ========== llm 配置 ========== -# LLM 模型配置 -VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "http://127.0.0.1:8081/v1") -LLM_API_KEY = os.getenv("LLM_API_KEY", "your-ai-api-key") -# llama.cpp Embedding 服务地址 (用于 Mem0 的向量化) -LLAMACPP_EMBEDDING_URL = os.getenv("LLAMACPP_EMBEDDING_URL", "http://127.0.0.1:8082/v1") -LLAMACPP_API_KEY = os.getenv("LLAMACPP_API_KEY", "your-llamacpp-api-key") +def _get_bool(key: str) -> bool | None: + """获取布尔配置,自动转换""" + value = os.getenv(key) + if value is not None: + return value.lower() in ("true", "1", "yes", "on") + return None -# ========== 后端服务配置 ========== -# 数据库连接字符串 -DB_URI = os.getenv( - "DB_URI", - "postgresql://postgres:huang1998@ai-postgres:5432/langgraph_db?sslmode=disable" -) -# 后端服务端口 -BACKEND_PORT = int(os.getenv("BACKEND_PORT", "8079")) - -# ========== 日志配置 ========== -LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper() -DEBUG = os.getenv("DEBUG", "false").lower() == "true" - -# ========== Reranker 服务配置 ========== -LLAMACPP_RERANKER_URL = os.getenv("LLAMACPP_RERANKER_URL", "http://127.0.0.1:8083") # ========== 第三方 API 密钥 ========== -ZHIPUAI_API_KEY = os.getenv("ZHIPUAI_API_KEY", "") -DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "") +ZHIPUAI_API_KEY = _get_str("ZHIPUAI_API_KEY") +DEEPSEEK_API_KEY = _get_str("DEEPSEEK_API_KEY") + + +# ========== llama.cpp 服务配置(URL + API密钥 配对) ========== +# 主 LLM 服务 +VLLM_BASE_URL = _get_str("VLLM_BASE_URL") +LLM_API_KEY = _get_str("LLAMACPP_API_KEY") + +# Embedding 服务 (用于 Mem0 的向量化) +LLAMACPP_EMBEDDING_URL = _get_str("LLAMACPP_EMBEDDING_URL") +LLAMACPP_API_KEY = _get_str("LLAMACPP_API_KEY") + +# Reranker 服务 +LLAMACPP_RERANKER_URL = _get_str("LLAMACPP_RERANKER_URL") + + +# ========== Qdrant 向量数据库配置(URL + API密钥 配对) ========== +QDRANT_URL = _get_str("QDRANT_URL") +QDRANT_API_KEY = _get_str("QDRANT_API_KEY") +QDRANT_COLLECTION_NAME = _get_str("QDRANT_COLLECTION_NAME") + + +# ========== PostgreSQL 数据库配置(分离配置 + 完整URI) ========== +# 分离配置(优先使用) +DB_HOST = _get_str("DB_HOST") +DB_PORT = _get_int("DB_PORT") +DB_USER = _get_str("DB_USER") +DB_PASSWORD = _get_str("DB_PASSWORD") +DB_NAME = _get_str("DB_NAME") + +# 完整连接字符串(直接从环境变量读取) +DB_URI = _get_str("DB_URI") + + +# ========== 后端服务配置 ========== +BACKEND_PORT = _get_int("BACKEND_PORT") + + +# ========== Mem0 记忆层配置 ========== +# 记忆提取间隔:每 N 轮对话生成一次摘要 +MEMORY_SUMMARIZE_INTERVAL = _get_int("MEMORY_SUMMARIZE_INTERVAL") + + +# ========== Graph 执行追踪配置 ========== +# 是否启用 Graph 流转追踪(通过环境变量控制) +ENABLE_GRAPH_TRACE = _get_bool("ENABLE_GRAPH_TRACE") + + +# ========== 日志配置 ========== +LOG_LEVEL = _get_str("LOG_LEVEL") +DEBUG = _get_bool("DEBUG") diff --git a/backend/rag_core/config.py b/backend/rag_core/config.py index 0d06575..0d73cdc 100644 --- a/backend/rag_core/config.py +++ b/backend/rag_core/config.py @@ -1,24 +1,54 @@ """ RAG Core 配置管理模块 集中管理所有环境变量配置项,避免散落在各个文件中 +所有配置直接从环境变量读取,无默认值,避免配置混乱 +需要类型转换的配置在此处理 """ import os -# ========== 向量数据库配置 ========== -QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333") -QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "") -# ========== 嵌入服务配置 ========== -LLAMACPP_EMBEDDING_URL = os.getenv("LLAMACPP_EMBEDDING_URL", "http://127.0.0.1:8082") -LLAMACPP_API_KEY = os.getenv("LLAMACPP_API_KEY", "") +# ========== 辅助函数:类型转换 ========== +def _get_str(key: str) -> str | None: + """获取字符串配置""" + return os.getenv(key) + + +def _get_int(key: str) -> int | None: + """获取整数配置,自动转换""" + value = os.getenv(key) + if value is not None: + try: + return int(value) + except (ValueError, TypeError): + pass + return None + + +# ========== 向量数据库配置(URL + API密钥 配对) ========== +QDRANT_URL = _get_str("QDRANT_URL") +QDRANT_API_KEY = _get_str("QDRANT_API_KEY") + + +# ========== 嵌入服务配置(URL + API密钥 配对) ========== +LLAMACPP_EMBEDDING_URL = _get_str("LLAMACPP_EMBEDDING_URL") +LLAMACPP_API_KEY = _get_str("LLAMACPP_API_KEY") + + +# ========== 文档存储配置(分离配置 + 完整URI) ========== +# 分离配置(优先使用) +DB_HOST = _get_str("DB_HOST") +DB_PORT = _get_int("DB_PORT") +DB_USER = _get_str("DB_USER") +DB_PASSWORD = _get_str("DB_PASSWORD") +DB_NAME = _get_str("DB_NAME") + +# 完整连接字符串(直接从环境变量读取) +DB_URI = _get_str("DB_URI") + +# 文档存储 URI(直接从环境变量读取,默认同 DB_URI) +DOCSTORE_URI = _get_str("DOCSTORE_URI") or DB_URI -# ========== 文档存储配置 ========== -DB_URI = os.getenv( - "DB_URI", - "postgresql://postgres:***@ai-postgres:5432/langgraph_db?sslmode=disable" -) -DOCSTORE_URI = os.getenv("DOCSTORE_URI", DB_URI) # ========== 其他配置 ========== -# 可以在此添加其他 RAG Core 专用的配置项 \ No newline at end of file +# 可以在此添加其他 RAG Core 专用的配置项 diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 0d14d0d..f665859 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -5,7 +5,7 @@ WORKDIR /app # ============================================================================= # 非敏感环境变量(固化在镜像中,无需通过 .env 配置) # ============================================================================= -ENV PYTHONPATH=/app:/app/backend +ENV PYTHONPATH=/app # llama.cpp 服务配置(本地部署标准端口) ENV VLLM_BASE_URL=http://host.docker.internal:18000/v1 @@ -19,10 +19,6 @@ ENV QDRANT_COLLECTION_NAME=mem0_user_memories ENV MEMORY_SUMMARIZE_INTERVAL=10 ENV ENABLE_GRAPH_TRACE=false -# unstructured 库 spaCy 模型配置 -ENV UNSTRUCTURED_LANGUAGE=eng -ENV SPACY_MODEL=en_core_web_sm - # 日志配置 ENV LOG_LEVEL=WARNING ENV DEBUG=false diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 87a61ed..2c02265 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,7 +1,4 @@ services: - # ⭐ PostgreSQL 和 Qdrant 已迁移到远程服务器 (115.190.121.151) - # 不再需要在本地 Docker Compose 中运行这些服务 - backend: build: context: .. # 构建上下文为项目根目录 @@ -18,12 +15,18 @@ services: - DEBUG=${DEBUG:-false} - ENABLE_GRAPH_TRACE=${ENABLE_GRAPH_TRACE:-false} - # ⭐ 基础设施配置:固化在 compose 文件中 - # PostgreSQL 连接(远程服务器) - - DB_URI=postgresql://postgres:huang1998@115.190.121.151:5432/langgraph_db?sslmode=disable - - # Qdrant 向量数据库(远程服务器) + # ⭐ 基础设施配置:从 .env 读取敏感信息 + # PostgreSQL 连接(远程服务器)- 分离凭据配置 + - DB_HOST=115.190.121.151 + - DB_PORT=5432 + - DB_USER=postgres + - DB_PASSWORD=${DB_PASSWORD} + - DB_NAME=langgraph_db + + # Qdrant 向量数据库(远程服务器)- 配对配置 - QDRANT_URL=http://115.190.121.151:6333 + - QDRANT_API_KEY=${QDRANT_API_KEY} + - QDRANT_COLLECTION_NAME=mem0_user_memories # 前端通信地址(Docker 内部网络) - API_URL=http://backend:8079/chat diff --git a/frontend/src/config.py b/frontend/src/config.py index 7b34700..f9631d5 100644 --- a/frontend/src/config.py +++ b/frontend/src/config.py @@ -1,6 +1,7 @@ """ 前端配置管理模块 集中管理所有配置项,支持环境变量覆盖 +需要类型转换的配置在此处理 """ import os @@ -12,6 +13,31 @@ from dotenv import load_dotenv load_dotenv() +# ========== 辅助函数:类型转换 ========== +def _get_str(key: str) -> str | None: + """获取字符串配置""" + return os.getenv(key) + + +def _get_int(key: str, default: int = 0) -> int: + """获取整数配置,自动转换""" + value = os.getenv(key) + if value is not None: + try: + return int(value) + except (ValueError, TypeError): + pass + return default + + +def _get_bool(key: str, default: bool = False) -> bool: + """获取布尔配置,自动转换""" + value = os.getenv(key) + if value is not None: + return value.lower() in ("true", "1", "yes", "on") + return default + + @dataclass class FrontendConfig: """前端配置类 - 统一管理所有配置项""" @@ -19,51 +45,55 @@ class FrontendConfig: # ==================== API 配置 ==================== api_base: str = "" - # ==================== 页面配置 ==================== + # ==================== 页面配置(固定值,无需环境变量) ==================== page_title: str = "AI 个人助手" page_icon: str = "🤖" layout: str = "wide" - # ==================== 模型配置 ==================== - default_model: str = "local" # 更改为local作为默认模型 + # ==================== 模型配置(固定值,无需环境变量) ==================== + default_model: str = "local" model_options: Optional[dict] = None - # ==================== 用户配置 ==================== + # ==================== 用户配置(固定值,无需环境变量) ==================== default_user_id: str = "default_user" - # ==================== 历史记录配置 ==================== + # ==================== 历史记录配置(固定值,无需环境变量) ==================== history_limit: int = 50 summary_max_length: int = 30 - # ==================== 流式响应配置 ==================== + # ==================== 流式响应配置(固定值,无需环境变量) ==================== stream_timeout: int = 120 + # ==================== 日志配置 ==================== + log_level: str = "" + debug: bool = False + def __post_init__(self): """初始化后处理 - 设置默认值和加载环境变量""" if self.model_options is None: self.model_options = { - "local": "本地 llama.cpp(Gemma-4)", # 本地模型作为第一个 - "deepseek": "DeepSeek V3.2(在线)", # DeepSeek 作为中间 - "zhipu": "智谱 GLM-4.7-Flash(在线)" # GLM-4.7 作为最后一个 + "local": "本地 llama.cpp(Gemma-4)", + "deepseek": "DeepSeek V3.2(在线)", + "zhipu": "智谱 GLM-4.7-Flash(在线)" } - # 从环境变量加载配置 + # 从环境变量加载配置(优先级最高) self._load_from_env() def _load_from_env(self): - """从环境变量加载配置(优先级最高)""" + """从环境变量加载配置(仅加载必要的配置项)""" # API 地址(移除 /chat 后缀) - # 优先级:环境变量 API_URL > 默认值 - api_url = os.getenv("API_URL", "http://127.0.0.1:8079") - self.api_base = api_url.replace("/chat", "").rstrip("/") - + api_url = _get_str("API_URL") + if api_url: + self.api_base = api_url.replace("/chat", "").rstrip("/") + # 日志配置 - self.log_level = os.getenv("LOG_LEVEL", "INFO").upper() - self.debug = os.getenv("DEBUG", "false").lower() == "true" + log_level = _get_str("LOG_LEVEL") + if log_level: + self.log_level = log_level.upper() + + self.debug = _get_bool("DEBUG", False) - # 日志配置 - self.log_level = os.getenv("LOG_LEVEL", "INFO").upper() - self.debug = os.getenv("DEBUG", "false").lower() == "true" # 全局配置实例(单例模式) -config = FrontendConfig() \ No newline at end of file +config = FrontendConfig() diff --git a/rag_indexer/__init__.py b/rag_indexer/__init__.py index 2a0117f..4a1e2e3 100644 --- a/rag_indexer/__init__.py +++ b/rag_indexer/__init__.py @@ -26,8 +26,22 @@ Offline RAG Indexer module. from .index_builder import IndexBuilder, IndexBuilderConfig, DocstoreConfig from .loaders import DocumentLoader from .splitters import SplitterType, get_splitter +from .config import ( + QDRANT_URL, + QDRANT_API_KEY, + LLAMACPP_EMBEDDING_URL, + LLAMACPP_API_KEY, + DB_URI, + DOCSTORE_URI, + RAG_OCR_LANGUAGES, + RAG_DOC_LANGUAGES, +) # 从 rag_core 重新导出常用组件 +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent / "backend")) + from rag_core import ( LlamaCppEmbedder, QdrantVectorStore, @@ -39,7 +53,7 @@ __version__ = "2.0.0" __all__ = [ # 核心构建器与配置 - "index_builder", + "IndexBuilder", "IndexBuilderConfig", "DocstoreConfig", @@ -50,6 +64,16 @@ __all__ = [ "SplitterType", "get_splitter", + # 配置 + "QDRANT_URL", + "QDRANT_API_KEY", + "LLAMACPP_EMBEDDING_URL", + "LLAMACPP_API_KEY", + "DB_URI", + "DOCSTORE_URI", + "RAG_OCR_LANGUAGES", + "RAG_DOC_LANGUAGES", + # 嵌入与向量存储 "LlamaCppEmbedder", "QdrantVectorStore", diff --git a/rag_indexer/cli.py b/rag_indexer/cli.py index e63d3e9..87cd56e 100755 --- a/rag_indexer/cli.py +++ b/rag_indexer/cli.py @@ -6,13 +6,24 @@ import asyncio import logging import sys from pathlib import Path +from dotenv import load_dotenv + +# 加载 .env 文件 +load_dotenv() # 添加项目根目录和 backend 目录到 Python 路径 sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent / "backend")) -from .index_builder import IndexBuilder, IndexBuilderConfig -from .splitters import SplitterType +# 导入方式:条件导入,支持作为脚本运行和作为包导入 +if __name__ == "__main__": + # 作为脚本直接运行时使用绝对导入 + from rag_indexer.index_builder import IndexBuilder, IndexBuilderConfig + from rag_indexer.splitters import SplitterType +else: + # 作为包导入时使用相对导入 + from .index_builder import IndexBuilder, IndexBuilderConfig + from .splitters import SplitterType logging.basicConfig( level=logging.INFO, diff --git a/rag_indexer/config.py b/rag_indexer/config.py index 218a431..ef89034 100644 --- a/rag_indexer/config.py +++ b/rag_indexer/config.py @@ -1,32 +1,71 @@ """ RAG Indexer 配置管理模块 集中管理所有环境变量配置项,避免散落在各个文件中 +所有配置直接从环境变量读取,无默认值,避免配置混乱 +需要类型转换的配置在此处理 """ import os -# 尝试从 rag_core 导入配置(如果可用) -try: - from rag_core.config import ( - QDRANT_URL, - QDRANT_API_KEY, - LLAMACPP_EMBEDDING_URL, - LLAMACPP_API_KEY, - DB_URI, - DOCSTORE_URI, - ) -except ImportError: - # 如果 rag_core 不可用,则直接读取环境变量 - QDRANT_URL = os.getenv("QDRANT_URL", "http://127.0.0.1:6333") - QDRANT_API_KEY = os.getenv("QDRANT_API_KEY", "") - LLAMACPP_EMBEDDING_URL = os.getenv("LLAMACPP_EMBEDDING_URL", "http://127.0.0.1:8082") - LLAMACPP_API_KEY = os.getenv("LLAMACPP_API_KEY", "") - DB_URI = os.getenv( - "DB_URI", - "postgresql://postgres:huang1998@ai-postgres:5432/langgraph_db?sslmode=disable" - ) - DOCSTORE_URI = os.getenv("DOCSTORE_URI", DB_URI) + +# ========== 辅助函数:类型转换 ========== +def _get_str(key: str) -> str | None: + """获取字符串配置""" + return os.getenv(key) + + +def _get_int(key: str) -> int | None: + """获取整数配置,自动转换""" + value = os.getenv(key) + if value is not None: + try: + return int(value) + except (ValueError, TypeError): + pass + return None + + +def _get_list_str(key: str, default: list[str] | None = None) -> list[str]: + """获取字符串列表配置,从逗号分隔的字符串解析""" + value = os.getenv(key) + if value is not None: + return [item.strip() for item in value.split(",") if item.strip()] + return default or [] + + +# ========== 向量数据库配置(URL + API密钥 配对) ========== +QDRANT_URL = _get_str("QDRANT_URL") +QDRANT_API_KEY = _get_str("QDRANT_API_KEY") + + +# ========== 嵌入服务配置(URL + API密钥 配对) ========== +LLAMACPP_EMBEDDING_URL = _get_str("LLAMACPP_EMBEDDING_URL") +LLAMACPP_API_KEY = _get_str("LLAMACPP_API_KEY") + + +# ========== 文档存储配置(分离配置 + 完整URI) ========== +# 分离配置(优先使用) +DB_HOST = _get_str("DB_HOST") +DB_PORT = _get_int("DB_PORT") +DB_USER = _get_str("DB_USER") +DB_PASSWORD = _get_str("DB_PASSWORD") +DB_NAME = _get_str("DB_NAME") + +# 完整连接字符串(直接从环境变量读取) +DB_URI = _get_str("DB_URI") + +# 文档存储 URI(直接从环境变量读取,默认同 DB_URI) +DOCSTORE_URI = _get_str("DOCSTORE_URI") or DB_URI + + +# ========== 文档加载器配置(unstructured 库) ========== +# OCR 语言列表(逗号分隔,如 "chi_sim,eng") +RAG_OCR_LANGUAGES = _get_list_str("RAG_OCR_LANGUAGES", ["chi_sim", "eng"]) + +# 文档主语言列表(逗号分隔,如 "zh") +RAG_DOC_LANGUAGES = _get_list_str("RAG_DOC_LANGUAGES", ["zh"]) + # ========== 索引器专用配置 ========== # 默认索引存储路径 -INDEX_STORAGE_PATH = os.getenv("INDEX_STORAGE_PATH", "./index_storage") \ No newline at end of file +INDEX_STORAGE_PATH = _get_str("INDEX_STORAGE_PATH") diff --git a/rag_indexer/index_builder.py b/rag_indexer/index_builder.py index 1be4633..587b1e4 100644 --- a/rag_indexer/index_builder.py +++ b/rag_indexer/index_builder.py @@ -23,6 +23,12 @@ from qdrant_client.http.exceptions import ResponseHandlingException from .loaders import DocumentLoader from .splitters import SplitterType, get_splitter + +# 从 rag_core 导入 +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent / "backend")) + from rag_core import LlamaCppEmbedder, QdrantVectorStore, create_docstore, create_parent_retriever logger = logging.getLogger(__name__) diff --git a/rag_indexer/loaders.py b/rag_indexer/loaders.py index c5c6e33..3f59b0a 100644 --- a/rag_indexer/loaders.py +++ b/rag_indexer/loaders.py @@ -11,6 +11,9 @@ from langchain_core.documents import Document from unstructured.documents.elements import Element from unstructured.partition.auto import partition +# 相对导入配置 +from .config import RAG_OCR_LANGUAGES, RAG_DOC_LANGUAGES + logger = logging.getLogger(__name__) # 模块加载时设置一次环境变量,避免重复设置 @@ -47,8 +50,8 @@ class DocumentLoader: """ self.extract_images = extract_images self.strategy = strategy - self.ocr_languages = ocr_languages or ["chi_sim", "eng"] - self.languages = languages or ["zh"] + self.ocr_languages = ocr_languages or RAG_OCR_LANGUAGES + self.languages = languages or RAG_DOC_LANGUAGES self.include_page_breaks = include_page_breaks self.pdf_infer_table_structure = pdf_infer_table_structure self.partition_kwargs = partition_kwargs or {} diff --git a/test/test_frontend.py b/test/test_frontend.py new file mode 100644 index 0000000..855bbbd --- /dev/null +++ b/test/test_frontend.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +""" +前端快速测试脚本 +验证前端导入是否正常工作 +""" + +import sys +import os + +# 添加必要的路径 +project_root = os.path.dirname(os.path.abspath(__file__)) +frontend_src = os.path.join(project_root, "frontend", "src") +backend_dir = os.path.join(project_root, "backend") + +sys.path.insert(0, project_root) +sys.path.insert(0, frontend_src) +sys.path.insert(0, backend_dir) + +print("=" * 60) +print("前端导入测试") +print("=" * 60) + +# 测试 1: 直接导入前端模块 +print("\n[测试 1] 直接导入前端模块...") +try: + from frontend.src.frontend_main import main + print("✅ frontend_main 导入成功") +except Exception as e: + print(f"❌ 导入失败: {e}") + sys.exit(1) + +# 测试 2: 导入配置 +print("\n[测试 2] 导入配置...") +try: + from config import config + print(f"✅ config 导入成功: page_title={config.page_title}") +except Exception as e: + print(f"❌ 导入失败: {e}") + +# 测试 3: 导入状态管理 +print("\n[测试 3] 导入状态管理...") +try: + from state import AppState + print("✅ AppState 导入成功") +except Exception as e: + print(f"❌ 导入失败: {e}") + +# 测试 4: 导入 API 客户端 +print("\n[测试 4] 导入 API 客户端...") +try: + from api_client import api_client + print("✅ api_client 导入成功") +except Exception as e: + print(f"❌ 导入失败: {e}") + +# 测试 5: 导入组件 +print("\n[测试 5] 导入组件...") +try: + from components.sidebar import render_sidebar + from components.chat_area import render_chat_area + from components.info_panel import render_info_panel + print("✅ 所有组件导入成功") +except Exception as e: + print(f"❌ 导入失败: {e}") + +print("\n" + "=" * 60) +print("🎉 所有前端导入测试通过!") +print("=" * 60) +print("\n现在可以使用 ./scripts/start.sh both 启动完整服务")