Files
ailine/rag_indexer/loaders.py

126 lines
4.5 KiB
Python
Raw Normal View History

2026-04-18 16:56:23 +08:00
"""
2026-04-19 15:01:40 +08:00
Document loader: parses documents with the unstructured library.
2026-04-18 16:56:23 +08:00
"""
import logging
from pathlib import Path
2026-04-19 15:01:40 +08:00
from typing import Any, Dict, List, Mapping, Optional, Union
2026-04-18 16:56:23 +08:00
from langchain_core.documents import Document
from unstructured.partition.auto import partition
logger = logging.getLogger(__name__)
class DocumentLoader:
    """Load files of various formats as LangChain ``Document`` objects.

    Files are parsed with ``unstructured``'s ``partition``; every parsed
    element that carries non-blank text becomes one ``Document``.
    """

    SUPPORTED_EXTENSIONS = {
        ".pdf",
        ".docx",
        ".doc",
        ".txt",
        ".md",
        ".html",
        ".pptx",
        ".xlsx",
        ".json",
    }

    def __init__(
        self,
        extract_images: bool = False,
        strategy: str = "auto",
        ocr_languages: Optional[List[str]] = None,
        languages: Optional[List[str]] = None,
        include_page_breaks: bool = False,
        pdf_infer_table_structure: bool = True,
        partition_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Configure how files are partitioned.

        Args:
            extract_images: Whether to extract images from PDFs (only
                forwarded for ``.pdf`` files).
            strategy: Parsing strategy (auto, fast, hi_res, ocr_only); only
                forwarded for ``.pdf`` files.
            ocr_languages: OCR languages, e.g. ``['chi_sim', 'eng']`` (the
                default when omitted or empty); only forwarded for ``.pdf``.
            languages: Main document languages, e.g. ``['zh']`` (the default
                when omitted or empty); forwarded for every file type.
            include_page_breaks: Whether to include page breaks.
            pdf_infer_table_structure: Whether to detect table structure in
                PDFs (the original note ties this to the hi_res strategy).
            partition_kwargs: Extra keyword arguments for ``partition``; they
                override every value derived from the options above.
        """
        import os

        # Process-wide side effect, applied on every instantiation.
        # NOTE(review): presumably switches off unstructured's language
        # heuristics for non-English text -- confirm against the library docs.
        os.environ["UNSTRUCTURED_LANGUAGE_CHECKS"] = "false"

        self.extract_images = extract_images
        self.strategy = strategy
        self.ocr_languages = ocr_languages or ["chi_sim", "eng"]
        self.languages = languages or ["zh"]
        self.include_page_breaks = include_page_breaks
        self.pdf_infer_table_structure = pdf_infer_table_structure
        # Copy so that later mutation of the caller's dict cannot silently
        # change how this loader parses files.
        self.partition_kwargs = dict(partition_kwargs) if partition_kwargs else {}

    def _build_partition_kwargs(self, suffix: str) -> Dict[str, Any]:
        """Assemble the keyword arguments passed to ``partition`` for *suffix*."""
        kwargs: Dict[str, Any] = {}
        if suffix == ".pdf":
            # PDF-only options.
            # NOTE(review): `ocr_languages` is sent together with `languages`;
            # verify how the installed unstructured version treats both.
            kwargs["strategy"] = self.strategy
            kwargs["ocr_languages"] = self.ocr_languages
            kwargs["extract_images_in_pdf"] = self.extract_images
            kwargs["pdf_infer_table_structure"] = self.pdf_infer_table_structure
        # `languages` applies to every file type.
        if self.languages:
            kwargs["languages"] = self.languages
        kwargs["include_page_breaks"] = self.include_page_breaks
        # User-supplied extras have the highest priority.
        kwargs.update(self.partition_kwargs)
        return kwargs

    def load_file(self, file_path: Union[str, Path]) -> "List[Document]":
        """Load a single file as a list of LangChain ``Document`` objects.

        Args:
            file_path: Path of the file to parse; it is resolved to an
                absolute path first.

        Returns:
            One ``Document`` per parsed element with non-blank text, each
            carrying ``source``, ``file_name`` and ``file_type`` metadata.
            The list is empty (and a warning is logged) when no text was
            extracted.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the file extension is not in
                ``SUPPORTED_EXTENSIONS``.
        """
        file_path = Path(file_path).resolve()
        if not file_path.exists():
            raise FileNotFoundError(f"文件不存在: {file_path}")

        suffix = file_path.suffix.lower()
        if suffix not in self.SUPPORTED_EXTENSIONS:
            # Sort so the message is identical from run to run (set iteration
            # order is not stable).
            supported = ", ".join(sorted(self.SUPPORTED_EXTENSIONS))
            raise ValueError(
                f"不支持的文件扩展名: {suffix}。支持的格式: {supported}"
            )

        # Parse with unstructured.
        elements = partition(
            filename=str(file_path),
            **self._build_partition_kwargs(suffix),
        )

        base_metadata = {
            "source": str(file_path),
            "file_name": file_path.name,
            "file_type": suffix,
        }
        documents = []
        for elem in elements:
            text = getattr(elem, "text", "")
            if not text or not text.strip():
                continue  # skip elements without usable text
            # Each document gets its own metadata dict so that downstream
            # mutation of one document does not leak into the others.
            documents.append(
                Document(page_content=text, metadata=dict(base_metadata))
            )

        if not documents:
            logger.warning("未从 %s 提取到文本内容", file_path)
        return documents

    def load_directory(
        self, directory_path: Union[str, Path], recursive: bool = True
    ) -> "List[Document]":
        """Load every supported file found in a directory.

        Args:
            directory_path: Directory to scan; resolved to an absolute path.
            recursive: When true, descend into sub-directories as well.

        Returns:
            The documents of all files that loaded successfully, in sorted
            path order. Files that fail to load are logged and skipped.

        Raises:
            NotADirectoryError: If ``directory_path`` is not a directory.
        """
        directory_path = Path(directory_path).resolve()
        if not directory_path.is_dir():
            raise NotADirectoryError(f"不是目录: {directory_path}")

        all_documents = []
        pattern = "**/*" if recursive else "*"
        # Sort for a reproducible document order; glob() yields entries in
        # arbitrary filesystem order.
        for file_path in sorted(directory_path.glob(pattern)):
            if not file_path.is_file():
                continue
            if file_path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
                continue
            try:
                all_documents.extend(self.load_file(file_path))
            except Exception as e:
                # Best-effort batch load: one broken file must not abort the
                # whole directory. Keep the traceback for diagnosis.
                logger.error("加载 %s 失败: %s", file_path, e, exc_info=True)
        return all_documents