mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-20 05:01:30 +00:00
fix(sqlite): fall back to journal_mode=DELETE on NFS/SMB/FUSE (#22043)
SQLite's WAL mode requires shared-memory (mmap) coordination and fcntl byte-range locks that don't reliably work on network filesystems. Upstream documents this explicitly: https://www.sqlite.org/wal.html#sometimes_queries_return_sqlite_busy_in_wal_mode

On NFS / SMB / some FUSE mounts / WSL1, 'PRAGMA journal_mode=WAL' raises 'sqlite3.OperationalError: locking protocol' (SQLITE_PROTOCOL). Before this change, every feature backed by state.db or kanban.db broke silently:
- /resume, /title, /history, /branch returned 'Session database not available.' with no cause
- gateway logged the init failure at DEBUG (invisible in errors.log)
- kanban dispatcher crashed every 60s, driving the known migration race (duplicate column name: consecutive_failures, #21708 / #21374)

Changes:
- hermes_state.apply_wal_with_fallback(): shared helper that tries WAL and falls back to DELETE on SQLITE_PROTOCOL-style errors with one WARNING explaining why
- hermes_state.get_last_init_error() + format_session_db_unavailable(): capture the init failure cause and surface it in user-facing strings (with an NFS/SMB pointer for 'locking protocol')
- hermes_cli/kanban_db.connect(): use the shared helper
- gateway/run.py: bump SessionDB init failure log DEBUG -> WARNING (matches cli.py's existing correct behavior)
- cli.py (4 sites) + gateway/run.py (5 sites): replace bare 'Session database not available.' with format_session_db_unavailable()

Tests: 12 new tests in tests/test_hermes_state_wal_fallback.py + 1 new test in tests/hermes_cli/test_kanban_db.py. Existing suites (state, kanban, gateway, cli) remain green for all tests unrelated to pre-existing failures on main.

Evidence: real-world user on NFSv3 mount (172.26.224.200:d2dfac12/home, local_lock=none) reporting 'Session database not available.' on /resume; 'locking protocol' appears in 4 distinct log entries across backup, kanban, TUI, and CLI paths in the same session.

closes #22032
This commit is contained in:
parent
ae005ec588
commit
2a7047c2ed
10 changed files with 584 additions and 32 deletions
|
|
@ -914,3 +914,55 @@ def test_latest_summaries_batch_omits_tasks_without_summary(kanban_home):
|
|||
assert out == {t1: "alpha", t3: "charlie"}
|
||||
# Empty input → empty dict, no SQL syntax error from "IN ()".
|
||||
assert kb.latest_summaries(conn, []) == {}
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# NFS / network-filesystem fallback (see hermes_state.apply_wal_with_fallback)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_connect_falls_back_to_delete_on_locking_protocol(kanban_home, caplog):
    """kanban_db.connect() must handle ``locking protocol`` on NFS/SMB.

    Without this fallback, the gateway's kanban dispatcher crashes every
    60s and the kanban migration (``consecutive_failures`` ADD COLUMN) is
    retried forever — which is what the real-world user report shows
    (see hermes-agent issue #22032).

    The network-filesystem failure is simulated by routing
    ``sqlite3.connect`` through a ``Connection`` subclass whose ``execute``
    raises ``OperationalError("locking protocol")`` for any
    ``journal_mode=WAL`` pragma. The test then checks that ``connect()``
    still returns a usable connection, that the connection ended up in
    DELETE journal mode, and that a WARNING naming ``kanban.db`` was logged.
    """
    import sqlite3 as _sqlite3
    from unittest.mock import patch as _patch

    # Clear module cache so a fresh connect() is attempted
    kb._INITIALIZED_PATHS.clear()

    # Capture the genuine connect() before patching so the wrapper below
    # does not recurse into the mock.
    real_connect = _sqlite3.connect

    class _WalBlockingConnection(_sqlite3.Connection):
        """Connection that rejects WAL the way NFS/SMB mounts do."""

        def execute(self, sql, *args, **kwargs):  # type: ignore[override]
            # Normalise case and spacing so "PRAGMA journal_mode = WAL"
            # and "pragma journal_mode=wal" are both intercepted.
            if "journal_mode=wal" in sql.lower().replace(" ", ""):
                raise _sqlite3.OperationalError("locking protocol")
            return super().execute(sql, *args, **kwargs)

    def wal_blocking_connect(*args, **kwargs):
        # Assign instead of passing factory= next to **kwargs: if the code
        # under test ever supplies its own factory, a duplicate keyword
        # would raise TypeError and mask the behaviour being tested.
        kwargs["factory"] = _WalBlockingConnection
        return real_connect(*args, **kwargs)

    with _patch("hermes_cli.kanban_db.sqlite3.connect", side_effect=wal_blocking_connect):
        with caplog.at_level("WARNING", logger="hermes_state"):
            conn = kb.connect()

    # Close the connection even when an assertion below fails, so a red
    # test does not leak an open SQLite handle into the rest of the run.
    try:
        # At least one fallback warning, naming kanban.db
        warnings = [
            r for r in caplog.records
            if r.levelname == "WARNING" and "kanban.db" in r.getMessage()
        ]
        assert len(warnings) >= 1, (
            f"Expected a kanban.db WARNING, got: {[r.getMessage() for r in caplog.records]}"
        )

        # The connection must actually be in the fallback journal mode.
        journal_mode = conn.execute("PRAGMA journal_mode").fetchone()[0]
        assert str(journal_mode).lower() == "delete", (
            f"Expected journal_mode=delete after fallback, got: {journal_mode!r}"
        )

        # DB still usable end-to-end — create + list a task
        t = kb.create_task(conn, title="post-fallback task")
        tasks = kb.list_tasks(conn)
        assert any(row.id == t for row in tasks)
    finally:
        conn.close()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue