mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(state): SQLite concurrency hardening + session transcript integrity (#3249)
* fix(session-db): survive CLI/gateway concurrent write contention Closes #3139 Three layered fixes for the scenario where CLI and gateway write to state.db concurrently, causing create_session() to fail with 'database is locked' and permanently disabling session_search on the gateway side. 1. Increase SQLite connection timeout: 10s -> 30s hermes_state.py: longer window for the WAL writer to finish a batch flush before the other process gives up entirely. 2. INSERT OR IGNORE in create_session hermes_state.py: prevents IntegrityError on duplicate session IDs (e.g. gateway restarts while CLI session is still alive). 3. Don't null out _session_db on create_session failure (main fix) run_agent.py: a transient lock at agent startup must not permanently disable session_search for the lifetime of that agent instance. _session_db now stays alive so subsequent flushes and searches work once the lock clears. 4. New ensure_session() helper + call it during flush hermes_state.py: INSERT OR IGNORE for a minimal session row. run_agent.py _flush_messages_to_session_db: calls ensure_session() before appending messages, so the FK constraint is satisfied even when create_session() failed at startup. No-op when the row exists. * fix(state): release lock between context queries in search_messages The context-window queries (one per FTS5 match) were running inside the same lock acquisition as the primary FTS5 query, holding the lock for O(N) sequential SQLite round-trips. Move per-match context fetches outside the outer lock block so each acquires the lock independently, keeping critical sections short and allowing other threads to interleave. * fix(session): prefer longer source in load_transcript to prevent legacy truncation When a long-lived session pre-dates SQLite storage (e.g. sessions created before the DB layer was introduced, or after a clean deployment that reset the DB), _flush_messages_to_session_db only writes the *new* messages from the current turn to SQLite — it skips messages already present in conversation_history, assuming they are already persisted. That assumption fails for legacy JSONL-only sessions: Turn N (first after DB migration): load_transcript(id) → SQLite: 0 → falls back to JSONL: 994 ✓ _flush_messages_to_session_db: skip first 994, write 2 new → SQLite: 2 Turn N+1: load_transcript(id) → SQLite: 2 → returns immediately ✗ Agent sees 2 messages of history instead of 996 The same pattern causes the reported symptom: session JSON truncated to 4 messages (_save_session_log writes agent.messages which only has 2 history + 2 new = 4). Fix: always load both sources and return whichever is longer. For a fully-migrated session SQLite will always be ≥ JSONL, so there is no regression. For a legacy session that hasn't been bootstrapped yet, JSONL wins and the full history is restored. Closes #3212 * test: add load_transcript source preference tests for #3212 Covers: JSONL longer returns JSONL, SQLite longer returns SQLite, SQLite empty falls back to JSONL, both empty returns empty, equal length prefers SQLite (richer reasoning fields). --------- Co-authored-by: Mibayy <mibayy@hermes.ai> Co-authored-by: kewe63 <kewe.3217@gmail.com> Co-authored-by: Mibayy <mibayy@users.noreply.github.com>
This commit is contained in:
parent
3a7907b278
commit
b81d49dc45
5 changed files with 247 additions and 33 deletions
|
|
@ -1116,3 +1116,66 @@ class TestResolveSessionByNameOrId:
|
|||
db.set_session_title("s1", "my project")
|
||||
result = db.resolve_session_by_title("my project")
|
||||
assert result == "s1"
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Concurrent write safety / lock contention fixes (#3139)
|
||||
# =========================================================================
|
||||
|
||||
class TestConcurrentWriteSafety:
|
||||
def test_create_session_insert_or_ignore_is_idempotent(self, db):
|
||||
"""create_session with the same ID twice must not raise (INSERT OR IGNORE)."""
|
||||
db.create_session(session_id="dup-1", source="cli", model="m")
|
||||
# Second call should be silent — no IntegrityError
|
||||
db.create_session(session_id="dup-1", source="gateway", model="m2")
|
||||
session = db.get_session("dup-1")
|
||||
# Row should exist (first write wins with OR IGNORE)
|
||||
assert session is not None
|
||||
assert session["source"] == "cli"
|
||||
|
||||
def test_ensure_session_creates_missing_row(self, db):
|
||||
"""ensure_session must create a minimal row when the session doesn't exist."""
|
||||
assert db.get_session("orphan-session") is None
|
||||
db.ensure_session("orphan-session", source="gateway", model="test-model")
|
||||
row = db.get_session("orphan-session")
|
||||
assert row is not None
|
||||
assert row["source"] == "gateway"
|
||||
assert row["model"] == "test-model"
|
||||
|
||||
def test_ensure_session_is_idempotent(self, db):
|
||||
"""ensure_session on an existing row must be a no-op (no overwrite)."""
|
||||
db.create_session(session_id="existing", source="cli", model="original-model")
|
||||
db.ensure_session("existing", source="gateway", model="overwrite-model")
|
||||
row = db.get_session("existing")
|
||||
# First write wins — ensure_session must not overwrite
|
||||
assert row["source"] == "cli"
|
||||
assert row["model"] == "original-model"
|
||||
|
||||
def test_ensure_session_allows_append_message_after_failed_create(self, db):
|
||||
"""Messages can be flushed even when create_session failed at startup.
|
||||
|
||||
Simulates the #3139 scenario: create_session raises (lock), then
|
||||
ensure_session is called during flush, then append_message succeeds.
|
||||
"""
|
||||
# Simulate failed create_session — row absent
|
||||
db.ensure_session("late-session", source="gateway", model="gpt-4")
|
||||
db.append_message(
|
||||
session_id="late-session",
|
||||
role="user",
|
||||
content="hello after lock",
|
||||
)
|
||||
msgs = db.get_messages("late-session")
|
||||
assert len(msgs) == 1
|
||||
assert msgs[0]["content"] == "hello after lock"
|
||||
|
||||
def test_sqlite_timeout_is_at_least_30s(self, db):
|
||||
"""Connection timeout should be >= 30s to survive CLI/gateway contention."""
|
||||
# Access the underlying connection timeout via sqlite3 introspection.
|
||||
# There is no public API, so we check the kwarg via the module default.
|
||||
import sqlite3
|
||||
import inspect
|
||||
from hermes_state import SessionDB as _SessionDB
|
||||
src = inspect.getsource(_SessionDB.__init__)
|
||||
assert "30" in src, (
|
||||
"SQLite timeout should be at least 30s to handle CLI/gateway lock contention"
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue