""" Document loaders using unstructured library. """ import logging from pathlib import Path from typing import List, Union from langchain_core.documents import Document from unstructured.partition.auto import partition logger = logging.getLogger(__name__) class DocumentLoader: """Load documents from various file formats.""" SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".doc", ".txt", ".md", ".html", ".pptx", ".xlsx"} def __init__(self, extract_images: bool = False): """ Args: extract_images: Whether to extract images from documents (requires additional dependencies) """ self.extract_images = extract_images def load_file(self, file_path: Union[str, Path]) -> List[Document]: """Load a single file into LangChain Document objects.""" file_path = Path(file_path).resolve() if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") suffix = file_path.suffix.lower() if suffix not in self.SUPPORTED_EXTENSIONS: raise ValueError( f"Unsupported file extension: {suffix}. Supported: {self.SUPPORTED_EXTENSIONS}" ) # Parse with unstructured elements = partition( filename=str(file_path), extract_images_in_pdf=self.extract_images, ) documents = [] for elem in elements: text = getattr(elem, "text", "") if not text or not text.strip(): continue # Base metadata metadata = { "source": str(file_path), "file_name": file_path.name, "file_type": suffix, } # Merge element-specific metadata without overwriting base fields elem_meta = getattr(elem, "metadata", {}) or {} for key, value in elem_meta.items(): if value and key not in metadata: metadata[key] = value documents.append(Document(page_content=text, metadata=metadata)) if not documents: logger.warning("No text content extracted from %s", file_path) return [] return documents def load_directory( self, directory_path: Union[str, Path], recursive: bool = True ) -> List[Document]: """Load all supported files from a directory.""" directory_path = Path(directory_path).resolve() if not directory_path.is_dir(): raise NotADirectoryError(f"Not a directory: {directory_path}") all_documents = [] pattern = "**/*" if recursive else "*" for file_path in directory_path.glob(pattern): if file_path.is_file() and file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS: try: docs = self.load_file(file_path) all_documents.extend(docs) except Exception as e: logger.error("Failed to load %s: %s", file_path, e) return all_documents