143 lines
4.6 KiB
Python
143 lines
4.6 KiB
Python
"""
|
|
Document store manager for ParentDocumentRetriever.
|
|
|
|
Supports both LocalFileStore (default) and custom PostgreSQL-backed stores.
|
|
"""
|
|
|
|
import json
import os
from typing import Iterator, List, Optional, Sequence, Tuple

from langchain.storage import BaseStore, LocalFileStore
|
|
|
|
|
|
def get_docstore(persist_path: str = None) -> LocalFileStore:
    """
    Build the file-backed document store that holds parent chunks.

    When no path is given, the location is derived from the environment:
    ``$HERMES_HOME/parent_docs`` if ``HERMES_HOME`` is set to a non-empty
    value, otherwise ``./parent_docs`` relative to the current directory.
    The directory is created if it does not exist yet.

    Args:
        persist_path: Directory in which parent documents are stored.
            ``None`` selects the default location described above.

    Returns:
        A ``LocalFileStore`` rooted at the resolved directory.
    """
    if persist_path is None:
        hermes_home = os.getenv("HERMES_HOME")
        # An unset or empty HERMES_HOME falls back to the working directory.
        persist_path = (
            os.path.join(hermes_home, "parent_docs")
            if hermes_home
            else "./parent_docs"
        )

    os.makedirs(persist_path, exist_ok=True)
    return LocalFileStore(persist_path)
|
|
|
|
|
|
class PostgresDocStore(BaseStore):
    """
    PostgreSQL-backed document store for parent chunks.

    This is an optional advanced feature. For most use cases,
    LocalFileStore is sufficient and simpler.

    Documents are kept in a ``parent_documents`` table
    (``key TEXT PRIMARY KEY, value JSONB, created_at TIMESTAMPTZ``).
    Values are serialised with ``json.dumps``, so they must be
    JSON-serialisable (the single-item API is typed for ``dict``).

    Besides the single-item ``get``/``set`` helpers, the class implements
    the batch methods that ``BaseStore`` declares abstract (``mget``,
    ``mset``, ``mdelete``, ``yield_keys``); without them the class cannot
    be instantiated.

    ``psycopg2`` is imported lazily on first connection, so the module can
    be imported without the driver installed as long as this class is not
    used.
    """

    # Single-statement upsert: insert the row, or overwrite the value when
    # the key already exists.
    _UPSERT_SQL = (
        "INSERT INTO parent_documents (key, value) VALUES (%s, %s) "
        "ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value"
    )

    def __init__(self, connection_string: str):
        """
        Initialize PostgreSQL document store.

        Opens a connection and creates the backing table if it is missing.

        Args:
            connection_string: PostgreSQL connection URL

        Raises:
            RuntimeError: If connecting or creating the table fails.
        """
        self.conn_string = connection_string
        self._conn = None

        # Create table if not exists
        self._create_table()

    def _create_table(self):
        """Create the parent documents table if not exists."""
        try:
            self._ensure_connection()
            # `with connection` commits on success and rolls back on error;
            # `with cursor` closes the cursor either way.
            with self._conn, self._conn.cursor() as cursor:
                cursor.execute(
                    """
                    CREATE TABLE IF NOT EXISTS parent_documents (
                        key TEXT PRIMARY KEY,
                        value JSONB NOT NULL,
                        created_at TIMESTAMPTZ DEFAULT NOW()
                    )
                    """
                )
        except Exception as e:
            raise RuntimeError(f"Failed to create PostgreSQL table: {e}") from e

    @staticmethod
    def _decode(raw):
        """Turn a fetched ``value`` column into a Python object."""
        # psycopg2 deserialises JSONB by default; only parse when the driver
        # handed back raw text (e.g. the JSON typecaster was disabled).
        if isinstance(raw, (str, bytes, bytearray)):
            return json.loads(raw)
        return raw

    def _fetch(self, keys: Sequence[str]) -> dict:
        """Return ``{key: value}`` for every requested key that exists."""
        wanted = list(keys)
        if not wanted:
            return {}
        self._ensure_connection()
        with self._conn, self._conn.cursor() as cursor:
            # A Python list is adapted to a PostgreSQL array for ANY().
            cursor.execute(
                "SELECT key, value FROM parent_documents WHERE key = ANY(%s)",
                (wanted,),
            )
            return {key: self._decode(raw) for key, raw in cursor.fetchall()}

    def _upsert(self, pairs: Sequence[Tuple[str, dict]]) -> None:
        """Insert or overwrite every ``(key, value)`` pair in one transaction."""
        # Serialise first so a non-serialisable value fails before any write.
        rows = [(key, json.dumps(value)) for key, value in pairs]
        if not rows:
            return
        self._ensure_connection()
        with self._conn, self._conn.cursor() as cursor:
            cursor.executemany(self._UPSERT_SQL, rows)

    def get(self, key: str) -> Optional[dict]:
        """
        Retrieve a document by key.

        Returns:
            The stored document, or ``None`` when the key is absent.

        Raises:
            RuntimeError: If the query fails.
        """
        try:
            return self._fetch([key]).get(key)
        except Exception as e:
            raise RuntimeError(f"Failed to retrieve document: {e}") from e

    def set(self, key: str, value: dict) -> None:
        """
        Store a document, replacing any existing document under ``key``.

        Raises:
            RuntimeError: If serialisation or the write fails.
        """
        try:
            self._upsert([(key, value)])
        except Exception as e:
            raise RuntimeError(f"Failed to store document: {e}") from e

    def mget(self, keys: Sequence[str]) -> List[Optional[dict]]:
        """
        Retrieve several documents at once.

        Returns:
            One entry per requested key, in request order; ``None`` marks
            a key that is not stored.

        Raises:
            RuntimeError: If the query fails.
        """
        try:
            found = self._fetch(keys)
        except Exception as e:
            raise RuntimeError(f"Failed to retrieve documents: {e}") from e
        return [found.get(key) for key in keys]

    def mset(self, key_value_pairs: Sequence[Tuple[str, dict]]) -> None:
        """
        Store several documents in a single transaction.

        Raises:
            RuntimeError: If serialisation or the write fails; nothing is
                committed in that case.
        """
        try:
            self._upsert(key_value_pairs)
        except Exception as e:
            raise RuntimeError(f"Failed to store documents: {e}") from e

    def mdelete(self, keys: Sequence[str]) -> None:
        """
        Delete the given keys; keys that are not stored are ignored.

        Raises:
            RuntimeError: If the delete fails.
        """
        doomed = list(keys)
        if not doomed:
            return
        try:
            self._ensure_connection()
            with self._conn, self._conn.cursor() as cursor:
                cursor.execute(
                    "DELETE FROM parent_documents WHERE key = ANY(%s)",
                    (doomed,),
                )
        except Exception as e:
            raise RuntimeError(f"Failed to delete documents: {e}") from e

    def yield_keys(self, *, prefix: Optional[str] = None) -> Iterator[str]:
        """
        Iterate over stored keys in sorted order.

        Args:
            prefix: When given and non-empty, only keys starting with this
                string are produced.

        Raises:
            RuntimeError: If the query fails (raised on first iteration).
        """
        try:
            self._ensure_connection()
            with self._conn, self._conn.cursor() as cursor:
                if prefix:
                    # strpos() == 1 is a literal prefix match; unlike LIKE it
                    # needs no escaping of '%' or '_' in the prefix.
                    cursor.execute(
                        "SELECT key FROM parent_documents "
                        "WHERE strpos(key, %s) = 1 ORDER BY key",
                        (prefix,),
                    )
                else:
                    cursor.execute("SELECT key FROM parent_documents ORDER BY key")
                # Materialise before yielding so no transaction stays open
                # while the caller consumes the iterator.
                keys = [row[0] for row in cursor.fetchall()]
        except Exception as e:
            raise RuntimeError(f"Failed to list document keys: {e}") from e
        yield from keys

    def _ensure_connection(self):
        """Ensure we have an open connection."""
        if self._conn is None or self._conn.closed:
            # Imported here so the driver is only required when this store
            # is actually used.
            import psycopg2

            self._conn = psycopg2.connect(self.conn_string)

    def close(self):
        """Close the connection."""
        if self._conn and not self._conn.closed:
            self._conn.close()
|
|
|
|
|
|
# Factory function for creating custom docstores
|
|
# Returns a tuple: (BaseStore instance, connection_string or None)
|
|
def create_docstore(
    store_type: str = "local",
    persist_path: Optional[str] = None,
    connection_string: Optional[str] = None,
) -> tuple:
    """
    Factory function to create different types of document stores.

    A PostgreSQL store is created only when ``store_type`` is "postgres"
    and a non-empty ``connection_string`` is supplied. Every other
    combination yields the local file store. If that fallback happens for
    any ``store_type`` other than "local", a ``RuntimeWarning`` is emitted
    so the misconfiguration is visible instead of silent.

    Args:
        store_type: "local" (default), "postgres"
        persist_path: Path for local file store
        connection_string: PostgreSQL connection string

    Returns:
        Tuple of (BaseStore instance, connection_string or None)
    """
    import warnings

    if store_type == "postgres" and connection_string:
        return (PostgresDocStore(connection_string), connection_string)

    if store_type == "postgres":
        warnings.warn(
            "store_type='postgres' requested without a connection_string; "
            "falling back to the local file store.",
            RuntimeWarning,
            stacklevel=2,
        )
    elif store_type != "local":
        warnings.warn(
            f"Unknown store_type {store_type!r}; "
            "falling back to the local file store.",
            RuntimeWarning,
            stacklevel=2,
        )

    return (get_docstore(persist_path), None)
|