91 lines
3.0 KiB
Python
91 lines
3.0 KiB
Python
"""
|
|
Document loaders using unstructured library.
|
|
"""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import List, Union
|
|
|
|
from langchain_core.documents import Document
|
|
from unstructured.partition.auto import partition
|
|
|
|
logger = logging.getLogger(__name__)


class DocumentLoader:
    """Load documents from various file formats.

    Files are parsed with ``unstructured.partition.auto.partition``. Every
    element that carries non-blank text becomes one LangChain ``Document``
    whose metadata combines file-level fields (``source``, ``file_name``,
    ``file_type``) with the element's own metadata.
    """

    # Lower-case suffixes accepted by load_file() and collected by load_directory().
    SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".doc", ".txt", ".md", ".html", ".pptx", ".xlsx"}

    def __init__(self, extract_images: bool = False):
        """
        Args:
            extract_images: Whether to extract images from documents (requires
                additional dependencies). Forwarded to ``partition`` as
                ``extract_images_in_pdf``.
        """
        self.extract_images = extract_images

    @staticmethod
    def _element_metadata(elem) -> dict:
        """Return the metadata of a parsed element as a plain dict.

        The ``metadata`` attribute may be a metadata object exposing
        ``to_dict()`` rather than a mapping, so it cannot be iterated with
        ``.items()`` directly. Plain dicts are copied; missing, ``None`` or
        unrecognised metadata yields an empty dict.
        """
        raw = getattr(elem, "metadata", None)
        if raw is None:
            return {}
        if isinstance(raw, dict):
            return dict(raw)
        to_dict = getattr(raw, "to_dict", None)
        if callable(to_dict):
            return dict(to_dict())
        return {}

    @classmethod
    def _merge_metadata(cls, base: dict, elem) -> dict:
        """Return a copy of ``base`` extended with the element's metadata.

        Keys already present in ``base`` are never overwritten, and falsy
        element values (``None``, empty strings, empty containers, ...) are
        skipped.
        """
        merged = dict(base)
        for key, value in cls._element_metadata(elem).items():
            if value and key not in merged:
                merged[key] = value
        return merged

    def load_file(self, file_path: Union[str, Path]) -> "List[Document]":
        """Load a single file into LangChain Document objects.

        Args:
            file_path: Path of the file to load; it is resolved to an
                absolute path before use.

        Returns:
            One Document per element with non-blank text, in the order the
            elements were returned by ``partition``. The list is empty (and a
            warning is logged) when no text was extracted.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the file suffix is not in ``SUPPORTED_EXTENSIONS``.
            IsADirectoryError: If the path has a supported suffix but is a
                directory.
        """
        file_path = Path(file_path).resolve()
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        suffix = file_path.suffix.lower()
        if suffix not in self.SUPPORTED_EXTENSIONS:
            raise ValueError(
                f"Unsupported file extension: {suffix}. Supported: {self.SUPPORTED_EXTENSIONS}"
            )

        # A directory can carry a supported suffix (e.g. "notes.txt/"); reject
        # it explicitly instead of handing it to the parser.
        if file_path.is_dir():
            raise IsADirectoryError(f"Expected a file, got a directory: {file_path}")

        # Parse with unstructured
        elements = partition(
            filename=str(file_path),
            extract_images_in_pdf=self.extract_images,
        )

        # File-level fields shared by every document produced from this file.
        base_metadata = {
            "source": str(file_path),
            "file_name": file_path.name,
            "file_type": suffix,
        }

        documents = []
        for elem in elements:
            text = getattr(elem, "text", "")
            if not text or not text.strip():
                continue  # skip elements without usable text
            metadata = self._merge_metadata(base_metadata, elem)
            documents.append(Document(page_content=text, metadata=metadata))

        if not documents:
            logger.warning("No text content extracted from %s", file_path)

        return documents

    def load_directory(
        self, directory_path: Union[str, Path], recursive: bool = True
    ) -> "List[Document]":
        """Load all supported files from a directory.

        Args:
            directory_path: Directory to scan; resolved to an absolute path.
            recursive: When True (default) sub-directories are scanned too.

        Returns:
            The documents of every supported file that loaded successfully,
            with files visited in sorted path order. Files that fail to load
            are logged and skipped.

        Raises:
            NotADirectoryError: If ``directory_path`` is not a directory.
        """
        directory_path = Path(directory_path).resolve()
        if not directory_path.is_dir():
            raise NotADirectoryError(f"Not a directory: {directory_path}")

        pattern = "**/*" if recursive else "*"
        all_documents = []

        # glob() yields entries in filesystem order; sort for reproducible output.
        for file_path in sorted(directory_path.glob(pattern)):
            if not file_path.is_file():
                continue
            if file_path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
                continue
            try:
                all_documents.extend(self.load_file(file_path))
            except Exception as e:
                # Best-effort batch load: one bad file must not abort the
                # whole directory, but keep the traceback for diagnosis.
                logger.error("Failed to load %s: %s", file_path, e, exc_info=True)

        return all_documents