检索器重构

2026-04-19 22:01:55 +08:00
parent cc8ef41ef9
commit 933d418d77
26 changed files with 1694 additions and 1717 deletions
--- a/rag_indexer/loaders.py
+++ b/rag_indexer/loaders.py
@@ -3,19 +3,27 @@
 """

 import logging
+import os
 from pathlib import Path
-from typing import Any, Dict, List, Mapping, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 from langchain_core.documents import Document
+from unstructured.documents.elements import Element
 from unstructured.partition.auto import partition

 logger = logging.getLogger(__name__)

+# 模块加载时设置一次环境变量，避免重复设置
+os.environ.setdefault("UNSTRUCTURED_LANGUAGE_CHECKS", "false")
+

 class DocumentLoader:
    """从各种文件格式加载文档。"""

-    SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".doc", ".txt", ".md", ".html", ".pptx", ".xlsx", ".json"}
+    SUPPORTED_EXTENSIONS = {
+        ".pdf", ".docx", ".doc", ".txt", ".md",
+        ".html", ".pptx", ".xlsx", ".json"
+    }

    def __init__(
        self,
@@ -32,13 +40,11 @@ class DocumentLoader:
            extract_images: 是否提取 PDF 中的图片
            strategy: 解析策略 (auto, fast, hi_res, ocr_only)
            ocr_languages: OCR 语言列表，如 ['chi_sim', 'eng']
-            languages: 文档主语言，如 ['zh']
+            languages: 文档主语言，如 ['zh']（主要用于非 OCR 场景）
            include_page_breaks: 是否包含分页符
-            pdf_infer_table_structure: 是否识别表格结构 (需 hi_res 策略)
+            pdf_infer_table_structure: 是否识别表格结构（需 hi_res 策略）
            partition_kwargs: 额外的 partition 参数字典（高级定制）
        """
-        import os
-        os.environ["UNSTRUCTURED_LANGUAGE_CHECKS"] = "false"
        self.extract_images = extract_images
        self.strategy = strategy
        self.ocr_languages = ocr_languages or ["chi_sim", "eng"]
@@ -47,6 +53,52 @@ class DocumentLoader:
        self.pdf_infer_table_structure = pdf_infer_table_structure
        self.partition_kwargs = partition_kwargs or {}

+    def _build_partition_kwargs(self, file_path: Path) -> Dict[str, Any]:
+        """Assemble the keyword arguments passed to partition for one file.
+
+        PDF-only options are added for ".pdf" paths, "languages" is added when
+        it is set, and self.partition_kwargs is merged last so it takes priority.
+        """
+        params: Dict[str, Any] = {"include_page_breaks": self.include_page_breaks}
+
+        is_pdf = file_path.suffix.lower() == ".pdf"
+        if is_pdf:
+            # These options are only passed along for PDF input.
+            params["strategy"] = self.strategy
+            params["ocr_languages"] = self.ocr_languages
+            params["extract_images_in_pdf"] = self.extract_images
+            params["pdf_infer_table_structure"] = self.pdf_infer_table_structure
+
+        # The language hint is sent for every file type, but only when non-empty.
+        if self.languages:
+            params["languages"] = self.languages
+
+        # Caller-supplied overrides are merged last, so they replace any key
+        # that was set above.
+        params.update(self.partition_kwargs)
+
+        return params
+
+    def _element_to_document(self, element: Element, file_path: Path) -> Optional[Document]:
+        """Convert one parsed Element into a Document; return None when it has no text.
+
+        Metadata holds the file path, name and lowercased suffix, plus the element's
+        page number and category; entries whose value is None are dropped.
+        """
+        text = getattr(element, "text", "")
+        if not text or not text.strip():
+            return None
+        metadata = {
+            "source": str(file_path),
+            "file_name": file_path.name,
+            "file_type": file_path.suffix.lower(),
+            "page_number": getattr(getattr(element, "metadata", None), "page_number", None),
+            # The category is an attribute of the element itself, not of element.metadata.
+            "category": getattr(element, "category", None),
+        }
+        metadata = {k: v for k, v in metadata.items() if v is not None}
+        return Document(page_content=text, metadata=metadata)
+
    def load_file(self, file_path: Union[str, Path]) -> List[Document]:
        """将单个文件加载为 LangChain Document 对象。"""
        file_path = Path(file_path).resolve()
@@ -59,68 +111,58 @@ class DocumentLoader:
                f"不支持的文件扩展名: {suffix}。支持的格式: {self.SUPPORTED_EXTENSIONS}"
            )

-        # 根据文件类型动态调整参数
-        extra_kwargs = {}
-        if suffix == ".pdf":
-            extra_kwargs["strategy"] = self.strategy
-            extra_kwargs["ocr_languages"] = self.ocr_languages
-            extra_kwargs["extract_images_in_pdf"] = self.extract_images
-            extra_kwargs["pdf_infer_table_structure"] = self.pdf_infer_table_structure
-        
-        # languages 参数适用于所有文件类型
-        if self.languages:
-            extra_kwargs["languages"] = self.languages
-        
-        extra_kwargs["include_page_breaks"] = self.include_page_breaks
+        kwargs = self._build_partition_kwargs(file_path)

-        # 合并用户自定义的额外参数（优先级最高）
-        extra_kwargs.update(self.partition_kwargs)
-
-        # 使用 unstructured 解析
-        elements = partition(
-            filename=str(file_path),
-
-            **extra_kwargs
-        )
+        try:
+            elements = partition(filename=str(file_path), **kwargs)
+        except Exception as e:
+            logger.exception("解析文件 %s 失败", file_path)
+            raise RuntimeError(f"文件解析失败: {file_path}") from e

        documents = []
        for elem in elements:
-            text = getattr(elem, "text", "")
-            if not text or not text.strip():
-                continue
-
-            # 基础元数据
-            metadata = {
-                "source": str(file_path),
-                "file_name": file_path.name,
-                "file_type": suffix,
-            }
-            
-            documents.append(Document(page_content=text, metadata=metadata))
+            doc = self._element_to_document(elem, file_path)
+            if doc:
+                documents.append(doc)

        if not documents:
            logger.warning("未从 %s 提取到文本内容", file_path)
-            return []

        return documents

    def load_directory(
-        self, directory_path: Union[str, Path], recursive: bool = True
+        self,
+        directory_path: Union[str, Path],
+        recursive: bool = True,
+        fail_fast: bool = False
    ) -> List[Document]:
-        """从目录加载所有支持的文件。"""
+        """
+        从目录加载所有支持的文件。
+
+        Args:
+            directory_path: 目录路径
+            recursive: 是否递归子目录
+            fail_fast: 遇到第一个失败时是否立即抛出异常
+        """
        directory_path = Path(directory_path).resolve()
        if not directory_path.is_dir():
            raise NotADirectoryError(f"不是目录: {directory_path}")

-        all_documents = []
+        all_documents: List[Document] = []
        pattern = "**/*" if recursive else "*"

        for file_path in directory_path.glob(pattern):
-            if file_path.is_file() and file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS:
-                try:
-                    docs = self.load_file(file_path)
-                    all_documents.extend(docs)
-                except Exception as e:
-                    logger.error("加载 %s 失败: %s", file_path, e)
+            if not file_path.is_file():
+                continue
+            if file_path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
+                continue
+
+            try:
+                docs = self.load_file(file_path)
+                all_documents.extend(docs)
+            except Exception as e:
+                logger.error("加载 %s 失败: %s", file_path, e)
+                if fail_fast:
+                    raise

        return all_documents