mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(state): SQLite concurrency hardening + session transcript integrity (#3249)
* fix(session-db): survive CLI/gateway concurrent write contention Closes #3139 Four layered fixes for the scenario where CLI and gateway write to state.db concurrently, causing create_session() to fail with 'database is locked' and permanently disabling session_search on the gateway side. 1. Increase SQLite connection timeout: 10s -> 30s hermes_state.py: longer window for the WAL writer to finish a batch flush before the other process gives up entirely. 2. INSERT OR IGNORE in create_session hermes_state.py: prevents IntegrityError on duplicate session IDs (e.g. gateway restarts while CLI session is still alive). 3. Don't null out _session_db on create_session failure (main fix) run_agent.py: a transient lock at agent startup must not permanently disable session_search for the lifetime of that agent instance. _session_db now stays alive so subsequent flushes and searches work once the lock clears. 4. New ensure_session() helper + call it during flush hermes_state.py: INSERT OR IGNORE for a minimal session row. run_agent.py _flush_messages_to_session_db: calls ensure_session() before appending messages, so the FK constraint is satisfied even when create_session() failed at startup. No-op when the row exists. * fix(state): release lock between context queries in search_messages The context-window queries (one per FTS5 match) were running inside the same lock acquisition as the primary FTS5 query, holding the lock for O(N) sequential SQLite round-trips. Move per-match context fetches outside the outer lock block so each acquires the lock independently, keeping critical sections short and allowing other threads to interleave. * fix(session): prefer longer source in load_transcript to prevent legacy truncation When a long-lived session pre-dates SQLite storage (e.g. 
sessions created before the DB layer was introduced, or after a clean deployment that reset the DB), _flush_messages_to_session_db only writes the *new* messages from the current turn to SQLite — it skips messages already present in conversation_history, assuming they are already persisted. That assumption fails for legacy JSONL-only sessions: Turn N (first after DB migration): load_transcript(id) → SQLite: 0 → falls back to JSONL: 994 ✓ _flush_messages_to_session_db: skip first 994, write 2 new → SQLite: 2 Turn N+1: load_transcript(id) → SQLite: 2 → returns immediately ✗ Agent sees 2 messages of history instead of 996 The same pattern causes the reported symptom: session JSON truncated to 4 messages (_save_session_log writes agent.messages which only has 2 history + 2 new = 4). Fix: always load both sources and return whichever is longer. For a fully-migrated session SQLite will always be ≥ JSONL, so there is no regression. For a legacy session that hasn't been bootstrapped yet, JSONL wins and the full history is restored. Closes #3212 * test: add load_transcript source preference tests for #3212 Covers: JSONL longer returns JSONL, SQLite longer returns SQLite, SQLite empty falls back to JSONL, both empty returns empty, equal length prefers SQLite (richer reasoning fields). --------- Co-authored-by: Mibayy <mibayy@hermes.ai> Co-authored-by: kewe63 <kewe.3217@gmail.com> Co-authored-by: Mibayy <mibayy@users.noreply.github.com>
This commit is contained in:
parent
3a7907b278
commit
b81d49dc45
5 changed files with 247 additions and 33 deletions
|
|
@ -124,7 +124,10 @@ class SessionDB:
|
|||
self._conn = sqlite3.connect(
|
||||
str(self.db_path),
|
||||
check_same_thread=False,
|
||||
timeout=10.0,
|
||||
# 30s gives the WAL writer (CLI or gateway) time to finish a batch
|
||||
# flush before the concurrent reader/writer gives up. 10s was too
|
||||
# short when the CLI is doing frequent memory flushes.
|
||||
timeout=30.0,
|
||||
)
|
||||
self._conn.row_factory = sqlite3.Row
|
||||
self._conn.execute("PRAGMA journal_mode=WAL")
|
||||
|
|
@ -255,7 +258,7 @@ class SessionDB:
|
|||
"""Create a new session record. Returns the session_id."""
|
||||
with self._lock:
|
||||
self._conn.execute(
|
||||
"""INSERT INTO sessions (id, source, user_id, model, model_config,
|
||||
"""INSERT OR IGNORE INTO sessions (id, source, user_id, model, model_config,
|
||||
system_prompt, parent_session_id, started_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||
(
|
||||
|
|
@ -351,6 +354,27 @@ class SessionDB:
|
|||
)
|
||||
self._conn.commit()
|
||||
|
||||
def ensure_session(
|
||||
self,
|
||||
session_id: str,
|
||||
source: str = "unknown",
|
||||
model: str = None,
|
||||
) -> None:
|
||||
"""Ensure a session row exists, creating it with minimal metadata if absent.
|
||||
|
||||
Used by _flush_messages_to_session_db to recover from a failed
|
||||
create_session() call (e.g. transient SQLite lock at agent startup).
|
||||
INSERT OR IGNORE is safe to call even when the row already exists.
|
||||
"""
|
||||
with self._lock:
|
||||
self._conn.execute(
|
||||
"""INSERT OR IGNORE INTO sessions
|
||||
(id, source, model, started_at)
|
||||
VALUES (?, ?, ?, ?)""",
|
||||
(session_id, source, model, time.time()),
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
def get_session(self, session_id: str) -> Optional[Dict[str, Any]]:
|
||||
"""Get a session by ID."""
|
||||
with self._lock:
|
||||
|
|
@ -862,9 +886,11 @@ class SessionDB:
|
|||
return []
|
||||
matches = [dict(row) for row in cursor.fetchall()]
|
||||
|
||||
# Add surrounding context (1 message before + after each match)
|
||||
for match in matches:
|
||||
try:
|
||||
# Add surrounding context (1 message before + after each match).
|
||||
# Done outside the lock so we don't hold it across N sequential queries.
|
||||
for match in matches:
|
||||
try:
|
||||
with self._lock:
|
||||
ctx_cursor = self._conn.execute(
|
||||
"""SELECT role, content FROM messages
|
||||
WHERE session_id = ? AND id >= ? - 1 AND id <= ? + 1
|
||||
|
|
@ -875,9 +901,9 @@ class SessionDB:
|
|||
{"role": r["role"], "content": (r["content"] or "")[:200]}
|
||||
for r in ctx_cursor.fetchall()
|
||||
]
|
||||
match["context"] = context_msgs
|
||||
except Exception:
|
||||
match["context"] = []
|
||||
match["context"] = context_msgs
|
||||
except Exception:
|
||||
match["context"] = []
|
||||
|
||||
# Remove full content from result (snippet is enough, saves tokens)
|
||||
for match in matches:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue