This commit is contained in:
142
rag_indexer/docstore_manager.py
Normal file
142
rag_indexer/docstore_manager.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""
|
||||
Document store manager for ParentDocumentRetriever.
|
||||
|
||||
Supports both LocalFileStore (default) and custom PostgreSQL-backed stores.
|
||||
"""
|
||||
|
||||
import json
import os
import warnings
from typing import Any, Iterator, List, Optional, Sequence, Tuple

from langchain.storage import BaseStore, LocalFileStore
|
||||
|
||||
|
||||
def get_docstore(persist_path: str = None) -> LocalFileStore:
    """
    Build a LocalFileStore for parent chunks, creating its directory if needed.

    Args:
        persist_path: Directory in which parent documents are stored. When
            omitted, ``<HERMES_HOME>/parent_docs`` is used if the HERMES_HOME
            environment variable is set to a non-empty value; otherwise
            ``./parent_docs``.

    Returns:
        A LocalFileStore rooted at the resolved directory.
    """
    target_dir = persist_path
    if target_dir is None:
        hermes_home = os.getenv("HERMES_HOME")
        # An unset or empty HERMES_HOME falls back to the working directory.
        target_dir = (
            os.path.join(hermes_home, "parent_docs")
            if hermes_home
            else "./parent_docs"
        )

    os.makedirs(target_dir, exist_ok=True)
    return LocalFileStore(target_dir)
|
||||
|
||||
|
||||
class PostgresDocStore(BaseStore):
    """
    PostgreSQL-backed document store for parent chunks.

    This is an optional advanced feature. For most use cases,
    LocalFileStore is sufficient and simpler.

    Documents live in a ``parent_documents`` table keyed by ``key`` with the
    payload in a JSONB ``value`` column, so every stored value must be
    JSON-serializable. Besides the single-item ``get``/``set`` helpers, the
    class implements the batch methods declared by ``BaseStore``
    (``mget``, ``mset``, ``mdelete``, ``yield_keys``).

    Every failing database operation is rolled back and re-raised as
    ``RuntimeError`` with the original exception chained as its cause.
    """

    _CREATE_TABLE_SQL = """
        CREATE TABLE IF NOT EXISTS parent_documents (
            key TEXT PRIMARY KEY,
            value JSONB NOT NULL,
            created_at TIMESTAMPTZ DEFAULT NOW()
        )
    """

    # A single atomic upsert replaces the former INSERT-then-UPDATE pair,
    # which failed on duplicate keys and bound the UPDATE parameters in the
    # wrong order.
    _UPSERT_SQL = (
        "INSERT INTO parent_documents (key, value) VALUES (%s, %s) "
        "ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value"
    )

    def __init__(self, connection_string: str):
        """
        Initialize PostgreSQL document store.

        Args:
            connection_string: PostgreSQL connection URL

        Raises:
            ImportError: If psycopg2 is not installed.
            RuntimeError: If the table cannot be created.
        """
        # Imported lazily so the module stays importable without psycopg2;
        # kept on the instance because the other methods need it too.
        import psycopg2

        self._psycopg2 = psycopg2
        self.conn_string = connection_string
        self._conn = None

        # Create table if not exists
        self._create_table()

    def _create_table(self):
        """Create the parent documents table if not exists."""
        self._execute("Failed to create PostgreSQL table", self._CREATE_TABLE_SQL)

    def _execute(
        self,
        failure: str,
        query: str,
        params: Any = None,
        *,
        many: bool = False,
        fetch: bool = False,
    ) -> Optional[List[tuple]]:
        """
        Run one statement in its own transaction.

        Args:
            failure: Prefix of the RuntimeError message raised on error.
            query: SQL text with ``%s`` placeholders.
            params: Parameters for ``query``; a sequence of parameter tuples
                when ``many`` is true.
            many: Execute the statement once per parameter tuple.
            fetch: Return all result rows instead of ``None``.

        Raises:
            RuntimeError: On any failure; the transaction is rolled back first.
        """
        try:
            self._ensure_connection()
            # The context manager closes the cursor even when execute() raises.
            with self._conn.cursor() as cursor:
                if many:
                    cursor.executemany(query, params)
                else:
                    cursor.execute(query, params)
                rows = cursor.fetchall() if fetch else None
            self._conn.commit()
            return rows
        except Exception as e:
            self._rollback()
            raise RuntimeError(f"{failure}: {e}") from e

    def _rollback(self):
        """Abort the current transaction so the connection stays usable."""
        if self._conn is None or self._conn.closed:
            return
        try:
            self._conn.rollback()
        except Exception:
            # Deliberately ignored: the caller is about to raise the error
            # that triggered this rollback, which is the one worth reporting.
            pass

    def get(self, key: str) -> Optional[dict]:
        """
        Retrieve a document by key.

        Returns:
            The stored value, or None when the key does not exist.

        Raises:
            RuntimeError: If the query fails.
        """
        return self.mget([key])[0]

    def set(self, key: str, value: dict) -> None:
        """
        Store a document, replacing any existing value for the key.

        Raises:
            RuntimeError: If the value is not JSON-serializable or the
                statement fails.
        """
        self.mset([(key, value)])

    def mget(self, keys: Sequence[str]) -> List[Optional[Any]]:
        """
        Retrieve several documents at once.

        Returns:
            One entry per requested key, in the same order; None for keys
            that are not stored.

        Raises:
            RuntimeError: If the query fails.
        """
        wanted = list(keys)
        if not wanted:
            return []
        # value::text makes the driver hand back JSON text regardless of
        # whether it would otherwise decode JSONB itself, so json.loads is
        # always applied to a string.
        rows = self._execute(
            "Failed to retrieve document",
            "SELECT key, value::text FROM parent_documents WHERE key = ANY(%s)",
            (wanted,),
            fetch=True,
        )
        found = {key: json.loads(raw) for key, raw in rows}
        return [found.get(key) for key in wanted]

    def mset(self, key_value_pairs: Sequence[Tuple[str, Any]]) -> None:
        """
        Store several documents in one transaction (all or nothing).

        Raises:
            RuntimeError: If a value is not JSON-serializable or the
                statement fails.
        """
        try:
            rows = [(key, json.dumps(value)) for key, value in key_value_pairs]
        except (TypeError, ValueError) as e:
            raise RuntimeError(f"Failed to store document: {e}") from e
        if not rows:
            return
        self._execute("Failed to store document", self._UPSERT_SQL, rows, many=True)

    def mdelete(self, keys: Sequence[str]) -> None:
        """
        Delete the given keys; keys that are not stored are ignored.

        Raises:
            RuntimeError: If the statement fails.
        """
        doomed = list(keys)
        if not doomed:
            return
        self._execute(
            "Failed to delete document",
            "DELETE FROM parent_documents WHERE key = ANY(%s)",
            (doomed,),
        )

    def yield_keys(self, *, prefix: Optional[str] = None) -> Iterator[str]:
        """
        Iterate over stored keys in sorted order.

        Args:
            prefix: When given and non-empty, only keys starting with it
                are yielded.

        Raises:
            RuntimeError: If the query fails.
        """
        if prefix:
            # Compared with left() rather than LIKE so that '%' and '_' in
            # the prefix are matched literally.
            query = (
                "SELECT key FROM parent_documents "
                "WHERE left(key, char_length(%s)) = %s ORDER BY key"
            )
            params = (prefix, prefix)
        else:
            query = "SELECT key FROM parent_documents ORDER BY key"
            params = None
        rows = self._execute(
            "Failed to list document keys", query, params, fetch=True
        )
        for (key,) in rows:
            yield key

    def _ensure_connection(self):
        """Ensure we have an open connection."""
        if self._conn is None or self._conn.closed:
            self._conn = self._psycopg2.connect(self.conn_string)

    def close(self):
        """Close the connection."""
        if self._conn and not self._conn.closed:
            self._conn.close()
|
||||
|
||||
|
||||
# Factory function for creating custom docstores
|
||||
# Returns a tuple: (BaseStore instance, connection_string or None)
|
||||
def create_docstore(
    store_type: str = "local",
    persist_path: str = None,
    connection_string: str = None
) -> tuple:
    """
    Factory function to create different types of document stores.

    A PostgreSQL store is only created when ``store_type`` is "postgres" and
    a non-empty ``connection_string`` is supplied. Every other combination
    returns the local file store; when that is a fallback rather than an
    explicit request for "local", a warning is emitted so the
    misconfiguration does not go unnoticed.

    Args:
        store_type: "local" (default), "postgres"
        persist_path: Path for local file store
        connection_string: PostgreSQL connection string

    Returns:
        Tuple of (BaseStore instance, connection_string or None)
    """
    if store_type == "postgres":
        if connection_string:
            return (PostgresDocStore(connection_string), connection_string)
        warnings.warn(
            'store_type="postgres" requires a connection_string; '
            "falling back to the local file store.",
            stacklevel=2,
        )
    elif store_type != "local":
        warnings.warn(
            f"Unknown store_type {store_type!r}; "
            "falling back to the local file store.",
            stacklevel=2,
        )

    return (get_docstore(persist_path), None)
|
||||
Reference in New Issue
Block a user