diff --git a/hermes_state.py b/hermes_state.py index 0391047d055..709a8de86e0 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -54,7 +54,6 @@ SCHEMA_VERSION = 13 _WAL_INCOMPAT_MARKERS = ( "locking protocol", # SQLITE_PROTOCOL on NFS/SMB "not authorized", # Some FUSE mounts block WAL pragma outright - "disk i/o error", # Flaky network FS during WAL setup ) # Last SessionDB() init error, per-process. Surfaced in /resume and @@ -125,6 +124,27 @@ def format_session_db_unavailable(prefix: str = "Session database not available" return f"{prefix}: {cause}{hint}." +def _on_disk_journal_mode(conn: sqlite3.Connection) -> Optional[str]: + """Read the journal mode from the SQLite DB header on disk. + + Returns the mode string (e.g. ``"wal"``, ``"delete"``), or ``None`` + if the value cannot be determined (new DB, or PRAGMA read failed). + """ + try: + row = conn.execute("PRAGMA journal_mode").fetchone() + except sqlite3.OperationalError: + return None + if row is None: + return None + mode = row[0] + if isinstance(mode, bytes): # defensive: sqlite3 occasionally returns bytes + try: + mode = mode.decode("ascii") + except UnicodeDecodeError: + return None + return str(mode).strip().lower() if mode is not None else None + + def apply_wal_with_fallback( conn: sqlite3.Connection, *, @@ -147,6 +167,8 @@ def apply_wal_with_fallback( Shared by :class:`SessionDB` and ``hermes_cli.kanban_db.connect`` so both databases get identical fallback behavior. + + Never downgrades to DELETE if the on-disk DB header reports WAL — see _on_disk_journal_mode. """ try: conn.execute("PRAGMA journal_mode=WAL") @@ -156,6 +178,10 @@ def apply_wal_with_fallback( if not any(marker in msg for marker in _WAL_INCOMPAT_MARKERS): # Unrelated OperationalError — don't silently swallow. raise + # Don't downgrade if another process already set WAL on disk. + existing = _on_disk_journal_mode(conn) + if existing == "wal": + raise _log_wal_fallback_once(db_label, exc) conn.execute("PRAGMA journal_mode=DELETE") return "delete" diff --git a/tests/hermes_cli/test_kanban_core_functionality.py b/tests/hermes_cli/test_kanban_core_functionality.py index 5b645d318f9..0b8386bafe4 100644 --- a/tests/hermes_cli/test_kanban_core_functionality.py +++ b/tests/hermes_cli/test_kanban_core_functionality.py @@ -36,6 +36,15 @@ def kanban_home(tmp_path, monkeypatch): home.mkdir() monkeypatch.setenv("HERMES_HOME", str(home)) monkeypatch.setattr(Path, "home", lambda: tmp_path) + # Disable the detect_crashed_workers grace period for legacy tests in + # this file that claim a task and immediately expect + # ``detect_crashed_workers`` to act on it. The grace period (30s by + # default, see ``DEFAULT_CRASH_GRACE_SECONDS``) prevents the + # multi-dispatcher reap race in production; setting it to 0 here + # restores the pre-fix instant-reclaim semantics these tests were + # written against. The grace-period itself is covered by dedicated + # tests in tests/hermes_cli/test_kanban_db.py. + monkeypatch.setenv("HERMES_KANBAN_CRASH_GRACE_SECONDS", "0") kb.init_db() return home diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py index c90fa4582b7..07e3b957e5f 100644 --- a/tests/hermes_cli/test_kanban_db.py +++ b/tests/hermes_cli/test_kanban_db.py @@ -2097,17 +2097,31 @@ def test_latest_summaries_batch_omits_tasks_without_summary(kanban_home): # NFS / network-filesystem fallback (see hermes_state.apply_wal_with_fallback) # --------------------------------------------------------------------------- -def test_connect_falls_back_to_delete_on_locking_protocol(kanban_home, caplog): +def test_connect_falls_back_to_delete_on_locking_protocol(tmp_path, monkeypatch, caplog): """kanban_db.connect() must handle ``locking protocol`` on NFS/SMB. Without this fallback, the gateway's kanban dispatcher crashes every 60s and the kanban migration (``consecutive_failures`` ADD COLUMN) is retried forever — which is what the real-world user report shows (see hermes-agent issue #22032). + + NOTE: We do NOT use the ``kanban_home`` fixture here because that + fixture pre-initializes the DB via ``kb.init_db()`` — putting the + file in WAL on disk. The Bug D safety guard now refuses to downgrade + to DELETE when the on-disk header is already WAL, so testing the + NFS-fallback path requires a truly-fresh DB file (NFS scenario in + production: first connection of the first process ever to touch the + file, where downgrading is safe because nobody else has WAL state + yet). """ import sqlite3 as _sqlite3 from unittest.mock import patch as _patch + home = tmp_path / ".hermes" + home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(home)) + monkeypatch.setattr(Path, "home", lambda: tmp_path) + # Clear module cache so a fresh connect() is attempted kb._INITIALIZED_PATHS.clear() diff --git a/tests/test_hermes_state_wal_fallback.py b/tests/test_hermes_state_wal_fallback.py index 05cee85012e..9dfc718acb1 100644 --- a/tests/test_hermes_state_wal_fallback.py +++ b/tests/test_hermes_state_wal_fallback.py @@ -110,15 +110,72 @@ class TestApplyWalWithFallback: assert mode == "delete" conn.close() - def test_falls_back_on_disk_io_error(self, tmp_path): - """Flaky network FS → disk I/O error → still fall back.""" + def test_reraises_on_disk_io_error(self, tmp_path): + """Transient EIO from ``PRAGMA journal_mode=WAL`` must NOT silently + downgrade to DELETE. + + Regression for "Bug D": treating transient EIO as a permanent + WAL-incompat marker produced the mixed-journal-mode-across-processes + corruption pattern (process A downgrades to DELETE, sibling + processes successfully set WAL, SQLite corrupts the file because + the two locking protocols are documented as incompatible). EIO is + usually transient (page-cache pressure, lock contention, brief + storage hiccups); the right behavior is to re-raise so the caller + can retry, not to walk the DB into a permanently downgraded state. + """ conn, _ = _open_blocking( tmp_path / "flaky.db", reason="disk I/O error", isolation_level=None ) - mode = apply_wal_with_fallback(conn) - assert mode == "delete" + with pytest.raises(sqlite3.OperationalError, match="disk I/O error"): + apply_wal_with_fallback(conn) conn.close() + def test_does_not_downgrade_when_disk_says_wal(self, tmp_path): + """Belt-and-suspenders: even if a marker matches, refuse to + downgrade when the on-disk DB header is already WAL. + + Prevents a future addition to ``_WAL_INCOMPAT_MARKERS`` from + accidentally reintroducing the mixed-journal-mode corruption + pattern. We construct a DB already in WAL on disk, then open a + new connection whose ``PRAGMA journal_mode=WAL`` raises one of + the legit markers — the function must still re-raise (refusing + the downgrade) because the on-disk file is WAL. + """ + # Prime the file in WAL mode using a normal connection + primer = sqlite3.connect( + str(tmp_path / "already-wal.db"), isolation_level=None + ) + try: + primer.execute("PRAGMA journal_mode=WAL") + primer.execute("CREATE TABLE t (x INTEGER)") + primer.execute("INSERT INTO t VALUES (1)") + assert ( + primer.execute("PRAGMA journal_mode").fetchone()[0].lower() == "wal" + ) + finally: + primer.close() + + # New connection whose WAL pragma raises "locking protocol" — a + # marker that WOULD normally trigger downgrade. With the on-disk + # guard, we must instead re-raise. + conn, _ = _open_blocking( + tmp_path / "already-wal.db", + reason="locking protocol", + isolation_level=None, + ) + with pytest.raises(sqlite3.OperationalError, match="locking protocol"): + apply_wal_with_fallback(conn) + conn.close() + + # And the file is STILL WAL on disk — nothing got rewritten + check = sqlite3.connect(str(tmp_path / "already-wal.db")) + try: + assert ( + check.execute("PRAGMA journal_mode").fetchone()[0].lower() == "wal" + ) + finally: + check.close() + def test_reraises_unrelated_operational_error(self, tmp_path): """Non-WAL-compat errors must NOT be silently swallowed by the fallback.""" conn, _ = _open_blocking(