# ailine/rag_indexer/loaders.py

"""
Document loaders using the unstructured library.
"""

import logging
from pathlib import Path
from typing import List, Union

from langchain_core.documents import Document
from unstructured.partition.auto import partition

logger = logging.getLogger(__name__)


class DocumentLoader:
    """Load documents from various file formats.

    Files are parsed with ``unstructured``'s ``partition``; every element that
    carries non-blank text becomes one LangChain ``Document``. Each document's
    metadata holds the file-level fields ``source``, ``file_name`` and
    ``file_type`` plus whatever metadata the element itself provides.
    """

    SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".doc", ".txt", ".md", ".html", ".pptx", ".xlsx"}

    def __init__(self, extract_images: bool = False):
        """
        Args:
            extract_images: Whether to extract images from documents (requires
                additional dependencies). Forwarded to ``partition`` as
                ``extract_images_in_pdf``.
        """
        self.extract_images = extract_images

    @staticmethod
    def _element_metadata(elem) -> dict:
        """Return the element's metadata as a plain dict (empty if unavailable)."""
        raw = getattr(elem, "metadata", None)
        if raw is None:
            return {}
        if isinstance(raw, dict):
            return raw
        # unstructured attaches an ElementMetadata object rather than a dict;
        # it has no .items(), so it must be converted through to_dict().
        to_dict = getattr(raw, "to_dict", None)
        if callable(to_dict):
            return to_dict() or {}
        return {}

    @staticmethod
    def _is_blank(value) -> bool:
        """Return True for None and for empty strings/containers.

        Falsy scalars such as ``0`` and ``False`` are NOT blank: they can be
        meaningful metadata (e.g. a depth of 0) and must be preserved.
        """
        if value is None:
            return True
        if isinstance(value, (str, bytes, list, tuple, dict, set, frozenset)):
            return len(value) == 0
        return False

    @classmethod
    def _merge_metadata(cls, base: dict, elem) -> dict:
        """Return a new dict: ``base`` plus the element's non-blank metadata.

        Keys already present in ``base`` are never overwritten, and ``base``
        itself is left untouched so it can be reused for every element.
        """
        merged = dict(base)
        for key, value in cls._element_metadata(elem).items():
            if key in merged or cls._is_blank(value):
                continue
            merged[key] = value
        return merged

    def load_file(self, file_path: Union[str, Path]) -> List["Document"]:
        """Load a single file into LangChain Document objects.

        Args:
            file_path: Path to the file; it is resolved to an absolute path.

        Returns:
            One ``Document`` per element with non-blank text, in the order
            ``partition`` returned them. Empty if nothing was extracted
            (a warning is logged in that case).

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the file extension is not in ``SUPPORTED_EXTENSIONS``.
        """
        file_path = Path(file_path).resolve()
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        suffix = file_path.suffix.lower()
        if suffix not in self.SUPPORTED_EXTENSIONS:
            raise ValueError(
                f"Unsupported file extension: {suffix}. Supported: {self.SUPPORTED_EXTENSIONS}"
            )

        # Parse with unstructured
        elements = partition(
            filename=str(file_path),
            extract_images_in_pdf=self.extract_images,
        )

        # File-level fields are identical for every element of this file.
        base_metadata = {
            "source": str(file_path),
            "file_name": file_path.name,
            "file_type": suffix,
        }

        documents = []
        for elem in elements:
            text = getattr(elem, "text", "")
            if not text or not text.strip():
                continue
            documents.append(
                Document(
                    page_content=text,
                    metadata=self._merge_metadata(base_metadata, elem),
                )
            )

        if not documents:
            logger.warning("No text content extracted from %s", file_path)

        return documents

    def load_directory(
        self, directory_path: Union[str, Path], recursive: bool = True
    ) -> List["Document"]:
        """Load all supported files from a directory.

        Files are visited in sorted path order so the resulting document order
        is reproducible across runs and filesystems. A file that fails to load
        is logged (with traceback) and skipped; it does not abort the run.

        Args:
            directory_path: Directory to scan; resolved to an absolute path.
            recursive: If True, descend into subdirectories.

        Returns:
            The concatenated documents of every file that loaded successfully.

        Raises:
            NotADirectoryError: If ``directory_path`` is not a directory.
        """
        directory_path = Path(directory_path).resolve()
        if not directory_path.is_dir():
            raise NotADirectoryError(f"Not a directory: {directory_path}")

        all_documents = []
        pattern = "**/*" if recursive else "*"

        for file_path in sorted(directory_path.glob(pattern)):
            if not file_path.is_file():
                continue
            if file_path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
                continue
            try:
                docs = self.load_file(file_path)
            except Exception as e:
                # Best-effort: one unreadable file must not stop the whole
                # directory; keep the traceback so the failure is diagnosable.
                logger.error("Failed to load %s: %s", file_path, e, exc_info=True)
            else:
                all_documents.extend(docs)

        return all_documents