diff --git a/hermes_state.py b/hermes_state.py
index 0391047d055..709a8de86e0 100644
--- a/hermes_state.py
+++ b/hermes_state.py
@@ -54,7 +54,6 @@ SCHEMA_VERSION = 13
 _WAL_INCOMPAT_MARKERS = (
     "locking protocol",       # SQLITE_PROTOCOL on NFS/SMB
     "not authorized",         # Some FUSE mounts block WAL pragma outright
-    "disk i/o error",         # Flaky network FS during WAL setup
 )
 
 # Last SessionDB() init error, per-process.  Surfaced in /resume and
@@ -125,6 +124,27 @@ def format_session_db_unavailable(prefix: str = "Session database not available"
     return f"{prefix}: {cause}{hint}."
 
 
+def _on_disk_journal_mode(conn: sqlite3.Connection) -> Optional[str]:
+    """Read the journal mode from the SQLite DB header on disk.
+
+    Returns the mode string (e.g. ``"wal"``, ``"delete"``), or ``None``
+    if the value cannot be determined (new DB, or PRAGMA read failed).
+    """
+    try:
+        row = conn.execute("PRAGMA journal_mode").fetchone()
+    except sqlite3.OperationalError:
+        return None
+    if row is None:
+        return None
+    mode = row[0]
+    if isinstance(mode, bytes):  # defensive: sqlite3 occasionally returns bytes
+        try:
+            mode = mode.decode("ascii")
+        except UnicodeDecodeError:
+            return None
+    return str(mode).strip().lower() if mode is not None else None
+
+
 def apply_wal_with_fallback(
     conn: sqlite3.Connection,
     *,
@@ -147,6 +167,8 @@ def apply_wal_with_fallback(
 
     Shared by :class:`SessionDB` and ``hermes_cli.kanban_db.connect`` so
     both databases get identical fallback behavior.
+
+    Never downgrades to DELETE if the on-disk DB header reports WAL — see _on_disk_journal_mode.
     """
     try:
         conn.execute("PRAGMA journal_mode=WAL")
@@ -156,6 +178,10 @@ def apply_wal_with_fallback(
         if not any(marker in msg for marker in _WAL_INCOMPAT_MARKERS):
             # Unrelated OperationalError — don't silently swallow.
             raise
+        # Don't downgrade if another process already set WAL on disk.
+        existing = _on_disk_journal_mode(conn)
+        if existing == "wal":
+            raise
         _log_wal_fallback_once(db_label, exc)
         conn.execute("PRAGMA journal_mode=DELETE")
         return "delete"
diff --git a/tests/hermes_cli/test_kanban_core_functionality.py b/tests/hermes_cli/test_kanban_core_functionality.py
index 5b645d318f9..0b8386bafe4 100644
--- a/tests/hermes_cli/test_kanban_core_functionality.py
+++ b/tests/hermes_cli/test_kanban_core_functionality.py
@@ -36,6 +36,15 @@ def kanban_home(tmp_path, monkeypatch):
     home.mkdir()
     monkeypatch.setenv("HERMES_HOME", str(home))
     monkeypatch.setattr(Path, "home", lambda: tmp_path)
+    # Disable the detect_crashed_workers grace period for legacy tests in
+    # this file that claim a task and immediately expect
+    # ``detect_crashed_workers`` to act on it. The grace period (30s by
+    # default, see ``DEFAULT_CRASH_GRACE_SECONDS``) prevents the
+    # multi-dispatcher reap race in production; setting it to 0 here
+    # restores the pre-fix instant-reclaim semantics these tests were
+    # written against. The grace-period itself is covered by dedicated
+    # tests in tests/hermes_cli/test_kanban_db.py.
+    monkeypatch.setenv("HERMES_KANBAN_CRASH_GRACE_SECONDS", "0")
     kb.init_db()
     return home
 
diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py
index c90fa4582b7..07e3b957e5f 100644
--- a/tests/hermes_cli/test_kanban_db.py
+++ b/tests/hermes_cli/test_kanban_db.py
@@ -2097,17 +2097,31 @@ def test_latest_summaries_batch_omits_tasks_without_summary(kanban_home):
 # NFS / network-filesystem fallback (see hermes_state.apply_wal_with_fallback)
 # ---------------------------------------------------------------------------
 
-def test_connect_falls_back_to_delete_on_locking_protocol(kanban_home, caplog):
+def test_connect_falls_back_to_delete_on_locking_protocol(tmp_path, monkeypatch, caplog):
     """kanban_db.connect() must handle ``locking protocol`` on NFS/SMB.
 
     Without this fallback, the gateway's kanban dispatcher crashes every
     60s and the kanban migration (``consecutive_failures`` ADD COLUMN) is
     retried forever — which is what the real-world user report shows
     (see hermes-agent issue #22032).
+
+    NOTE: We do NOT use the ``kanban_home`` fixture here because that
+    fixture pre-initializes the DB via ``kb.init_db()`` — putting the
+    file in WAL on disk. The Bug D safety guard now refuses to downgrade
+    to DELETE when the on-disk header is already WAL, so testing the
+    NFS-fallback path requires a truly-fresh DB file (NFS scenario in
+    production: first connection of the first process ever to touch the
+    file, where downgrading is safe because nobody else has WAL state
+    yet).
     """
     import sqlite3 as _sqlite3
     from unittest.mock import patch as _patch
 
+    home = tmp_path / ".hermes"
+    home.mkdir()
+    monkeypatch.setenv("HERMES_HOME", str(home))
+    monkeypatch.setattr(Path, "home", lambda: tmp_path)
+
     # Clear module cache so a fresh connect() is attempted
     kb._INITIALIZED_PATHS.clear()
 
diff --git a/tests/test_hermes_state_wal_fallback.py b/tests/test_hermes_state_wal_fallback.py
index 05cee85012e..9dfc718acb1 100644
--- a/tests/test_hermes_state_wal_fallback.py
+++ b/tests/test_hermes_state_wal_fallback.py
@@ -110,15 +110,72 @@ class TestApplyWalWithFallback:
         assert mode == "delete"
         conn.close()
 
-    def test_falls_back_on_disk_io_error(self, tmp_path):
-        """Flaky network FS → disk I/O error → still fall back."""
+    def test_reraises_on_disk_io_error(self, tmp_path):
+        """Transient EIO from ``PRAGMA journal_mode=WAL`` must NOT silently
+        downgrade to DELETE.
+
+        Regression for "Bug D": treating transient EIO as a permanent
+        WAL-incompat marker produced the mixed-journal-mode-across-processes
+        corruption pattern (process A downgrades to DELETE, sibling
+        processes successfully set WAL, SQLite corrupts the file because
+        the two locking protocols are documented as incompatible). EIO is
+        usually transient (page-cache pressure, lock contention, brief
+        storage hiccups); the right behavior is to re-raise so the caller
+        can retry, not to walk the DB into a permanently downgraded state.
+        """
         conn, _ = _open_blocking(
             tmp_path / "flaky.db", reason="disk I/O error", isolation_level=None
         )
-        mode = apply_wal_with_fallback(conn)
-        assert mode == "delete"
+        with pytest.raises(sqlite3.OperationalError, match="disk I/O error"):
+            apply_wal_with_fallback(conn)
         conn.close()
 
+    def test_does_not_downgrade_when_disk_says_wal(self, tmp_path):
+        """Belt-and-suspenders: even if a marker matches, refuse to
+        downgrade when the on-disk DB header is already WAL.
+
+        Prevents a future addition to ``_WAL_INCOMPAT_MARKERS`` from
+        accidentally reintroducing the mixed-journal-mode corruption
+        pattern. We construct a DB already in WAL on disk, then open a
+        new connection whose ``PRAGMA journal_mode=WAL`` raises one of
+        the legit markers — the function must still re-raise (refusing
+        the downgrade) because the on-disk file is WAL.
+        """
+        # Prime the file in WAL mode using a normal connection
+        primer = sqlite3.connect(
+            str(tmp_path / "already-wal.db"), isolation_level=None
+        )
+        try:
+            primer.execute("PRAGMA journal_mode=WAL")
+            primer.execute("CREATE TABLE t (x INTEGER)")
+            primer.execute("INSERT INTO t VALUES (1)")
+            assert (
+                primer.execute("PRAGMA journal_mode").fetchone()[0].lower() == "wal"
+            )
+        finally:
+            primer.close()
+
+        # New connection whose WAL pragma raises "locking protocol" — a
+        # marker that WOULD normally trigger downgrade. With the on-disk
+        # guard, we must instead re-raise.
+        conn, _ = _open_blocking(
+            tmp_path / "already-wal.db",
+            reason="locking protocol",
+            isolation_level=None,
+        )
+        with pytest.raises(sqlite3.OperationalError, match="locking protocol"):
+            apply_wal_with_fallback(conn)
+        conn.close()
+
+        # And the file is STILL WAL on disk — nothing got rewritten
+        check = sqlite3.connect(str(tmp_path / "already-wal.db"))
+        try:
+            assert (
+                check.execute("PRAGMA journal_mode").fetchone()[0].lower() == "wal"
+            )
+        finally:
+            check.close()
+
     def test_reraises_unrelated_operational_error(self, tmp_path):
         """Non-WAL-compat errors must NOT be silently swallowed by the fallback."""
         conn, _ = _open_blocking(