This commit is contained in:
91
rag_indexer/loaders.py
Normal file
91
rag_indexer/loaders.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""
|
||||
Document loaders using unstructured library.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentLoader:
    """Load documents from various file formats.

    Uses the ``unstructured`` library to parse each file into elements and
    wraps every non-empty element in a LangChain ``Document`` carrying
    source/file metadata.
    """

    # Extensions routed to ``unstructured.partition.auto.partition``.
    SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".doc", ".txt", ".md", ".html", ".pptx", ".xlsx"}

    def __init__(self, extract_images: bool = False):
        """
        Args:
            extract_images: Whether to extract images from documents
                (requires additional dependencies).
        """
        self.extract_images = extract_images

    def load_file(self, file_path: Union[str, Path]) -> List[Document]:
        """Load a single file into LangChain Document objects.

        Args:
            file_path: Path to a file whose extension is in
                ``SUPPORTED_EXTENSIONS``.

        Returns:
            One ``Document`` per non-empty parsed element; an empty list
            when no text could be extracted.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file extension is not supported.
        """
        file_path = Path(file_path).resolve()
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        suffix = file_path.suffix.lower()
        if suffix not in self.SUPPORTED_EXTENSIONS:
            raise ValueError(
                f"Unsupported file extension: {suffix}. Supported: {self.SUPPORTED_EXTENSIONS}"
            )

        # Parse with unstructured
        elements = partition(
            filename=str(file_path),
            extract_images_in_pdf=self.extract_images,
        )

        documents = []
        for elem in elements:
            text = getattr(elem, "text", "")
            if not text or not text.strip():
                continue

            # Base metadata — never overwritten by element metadata below.
            metadata = {
                "source": str(file_path),
                "file_name": file_path.name,
                "file_type": suffix,
            }
            self._merge_element_metadata(metadata, elem)
            documents.append(Document(page_content=text, metadata=metadata))

        if not documents:
            logger.warning("No text content extracted from %s", file_path)
            return []

        return documents

    @staticmethod
    def _merge_element_metadata(metadata: dict, elem) -> None:
        """Merge element-specific metadata into *metadata* in place.

        Fixes a bug in the previous version: ``unstructured`` elements
        expose ``metadata`` as an ``ElementMetadata`` object (not a dict),
        so calling ``.items()`` on it directly raised ``AttributeError``.
        We convert via ``to_dict()`` when available, keep dicts as-is, and
        fall back to merging nothing for unknown shapes.
        """
        elem_meta = getattr(elem, "metadata", None)
        if elem_meta is None:
            return
        if not isinstance(elem_meta, dict):
            to_dict = getattr(elem_meta, "to_dict", None)
            elem_meta = to_dict() if callable(to_dict) else {}
        for key, value in elem_meta.items():
            # Skip empty values and never clobber the base fields.
            if value and key not in metadata:
                metadata[key] = value

    def load_directory(
        self, directory_path: Union[str, Path], recursive: bool = True
    ) -> List[Document]:
        """Load all supported files from a directory.

        Files that fail to load are logged and skipped rather than
        aborting the whole walk.

        Args:
            directory_path: Directory to scan.
            recursive: Recurse into subdirectories when True.

        Returns:
            Documents from every successfully loaded file.

        Raises:
            NotADirectoryError: If *directory_path* is not a directory.
        """
        directory_path = Path(directory_path).resolve()
        if not directory_path.is_dir():
            raise NotADirectoryError(f"Not a directory: {directory_path}")

        all_documents = []
        pattern = "**/*" if recursive else "*"

        # Sort for deterministic ingestion order across filesystems.
        for file_path in sorted(directory_path.glob(pattern)):
            if file_path.is_file() and file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS:
                try:
                    docs = self.load_file(file_path)
                    all_documents.extend(docs)
                except Exception as e:
                    logger.error("Failed to load %s: %s", file_path, e)

        return all_documents
|
||||
Reference in New Issue
Block a user