From dc98314fbd4b8690fdeb07d6d73677c357f2a06d Mon Sep 17 00:00:00 2001 From: Stephen Chin Date: Mon, 25 May 2026 23:26:17 -0700 Subject: [PATCH] fix(kanban): skip redundant WAL pragma on already-WAL connections apply_wal_with_fallback() issued PRAGMA journal_mode=WAL on every call, including connections to DBs already in WAL mode. This triggered the WAL init code path, causing SQLite to acquire EXCLUSIVE, checkpoint, and unlink kanban.db-{wal,shm}. Other open connections received (deleted) FDs and raised sqlite3.OperationalError: disk I/O error. Add a cheap read probe (PRAGMA journal_mode, no flock/checkpoint/unlink) before the set-pragma path. If already wal, return early. The set-pragma and DELETE fallback paths are unchanged. Closes #31158. Addresses root cause that PRs #32226 and #32322 attempted via connection-sharing/caching approaches. --- hermes_state.py | 9 + tests/test_hermes_state.py | 220 ++++++++++++++++++++++++ tests/test_hermes_state_wal_fallback.py | 35 ++-- 3 files changed, 250 insertions(+), 14 deletions(-) diff --git a/hermes_state.py b/hermes_state.py index 709a8de86e0..ba33598b91e 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -170,6 +170,15 @@ def apply_wal_with_fallback( Never downgrades to DELETE if the on-disk DB header reports WAL — see _on_disk_journal_mode. """ + # Read-only probe — no flock, no checkpoint, no WAL/SHM unlink. + # Skipping the set-pragma prevents WAL-init from unlinking files other connections hold open. + try: + current_mode = conn.execute("PRAGMA journal_mode").fetchone() + if current_mode and current_mode[0] == "wal": + return "wal" + except sqlite3.OperationalError: + pass + try: conn.execute("PRAGMA journal_mode=WAL") return "wal" diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py index baabef000d2..d0815762175 100644 --- a/tests/test_hermes_state.py +++ b/tests/test_hermes_state.py @@ -3021,3 +3021,223 @@ class TestFTS5ToolCallMigration: finally: session_db.close() + +# --------------------------------------------------------------------------- +# apply_wal_with_fallback — read-only probe tests +# --------------------------------------------------------------------------- + + +class TestApplyWalProbe: + """Unit tests for the journal_mode probe in apply_wal_with_fallback.""" + + def test_skips_set_pragma_when_already_wal(self, tmp_path): + """Already-WAL connection must not trigger the set-pragma.""" + import sqlite3 + from hermes_state import apply_wal_with_fallback + + class _TracingConn(sqlite3.Connection): + def __init__(self, *a, **kw): + super().__init__(*a, **kw) + self.executed = [] + + def execute(self, sql, params=()): + self.executed.append(sql) + return super().execute(sql, params) + + db_path = tmp_path / "wal.db" + # Prime the file into WAL mode first. + with sqlite3.connect(str(db_path)) as seed: + seed.execute("PRAGMA journal_mode=WAL") + + conn = _TracingConn(str(db_path)) + try: + result = apply_wal_with_fallback(conn) + finally: + conn.close() + + assert result == "wal" + # Only the probe should have fired; the set-pragma must NOT appear. + assert any("PRAGMA journal_mode" == sql.strip() for sql in conn.executed), ( + "probe PRAGMA should have run" + ) + assert not any("journal_mode=WAL" in sql for sql in conn.executed), ( + "set-pragma must not run when already in WAL mode" + ) + + def test_sets_wal_on_fresh_connection(self, tmp_path): + """Probe sees 'delete', then set-pragma runs and returns 'wal'.""" + import sqlite3 + from hermes_state import apply_wal_with_fallback + + class _TracingConn(sqlite3.Connection): + def __init__(self, *a, **kw): + super().__init__(*a, **kw) + self.executed = [] + + def execute(self, sql, params=()): + self.executed.append(sql) + return super().execute(sql, params) + + db_path = tmp_path / "fresh.db" + conn = _TracingConn(str(db_path)) + try: + result = apply_wal_with_fallback(conn) + finally: + conn.close() + + assert result == "wal" + assert any("journal_mode=WAL" in sql for sql in conn.executed), ( + "set-pragma must fire on a fresh (non-WAL) connection" + ) + + def test_apply_wal_concurrent_connects_no_eio(self, tmp_path): + """20 threads calling connect() on the same DB must not see disk I/O error.""" + import sys + import threading + import sqlite3 + from hermes_state import apply_wal_with_fallback + + db_path = tmp_path / "concurrent.db" + errors = [] + + def _connect_cycle(): + for _ in range(5): + try: + conn = sqlite3.connect(str(db_path)) + apply_wal_with_fallback(conn) + conn.close() + except sqlite3.OperationalError as exc: + if "disk i/o error" in str(exc).lower(): + errors.append(exc) + + threads = [threading.Thread(target=_connect_cycle) for _ in range(20)] + for t in threads: + t.start() + for t in threads: + t.join() + + assert not errors, f"disk I/O errors from concurrent connects: {errors}" + + # Linux-only: no (deleted) WAL/SHM FDs should accumulate. + if sys.platform == "linux": + import os + + fd_dir = f"/proc/{os.getpid()}/fd" + deleted_fds = [] + for fd_name in os.listdir(fd_dir): + try: + target = os.readlink(os.path.join(fd_dir, fd_name)) + if "(deleted)" in target and ( + "wal" in target.lower() or "shm" in target.lower() + ): + deleted_fds.append(target) + except OSError: + pass + assert not deleted_fds, f"stale deleted WAL/SHM FDs: {deleted_fds}" + + def test_fallback_to_delete_still_works(self, tmp_path): + """When set-pragma raises a WAL-incompat error, falls back to DELETE.""" + import sqlite3 + from hermes_state import apply_wal_with_fallback + + class _IncompatConn(sqlite3.Connection): + def __init__(self, *a, **kw): + super().__init__(*a, **kw) + self._call_count = 0 + + def execute(self, sql, params=()): + self._call_count += 1 + # First call is the read probe; let it return "delete". + # Second call is the set-pragma; raise a WAL-incompat error. + if "journal_mode=WAL" in sql: + raise sqlite3.OperationalError("locking protocol") + return super().execute(sql, params) + + db_path = tmp_path / "incompat.db" + conn = _IncompatConn(str(db_path)) + try: + result = apply_wal_with_fallback(conn, db_label="test.db") + finally: + conn.close() + + assert result == "delete" + + def test_probe_failure_falls_through_to_set_pragma(self, tmp_path): + """When the read probe raises OperationalError, fall through to set-pragma.""" + import sqlite3 + from hermes_state import apply_wal_with_fallback + + class _ProbeFails(sqlite3.Connection): + def __init__(self, *a, **kw): + super().__init__(*a, **kw) + self._first = True + + def execute(self, sql, params=()): + if self._first and "journal_mode" in sql and "WAL" not in sql: + self._first = False + raise sqlite3.OperationalError("simulated probe failure") + return super().execute(sql, params) + + db_path = tmp_path / "probe_fail.db" + conn = _ProbeFails(str(db_path)) + try: + result = apply_wal_with_fallback(conn) + finally: + conn.close() + + # Despite probe failure, set-pragma must still run and succeed. + assert result == "wal" + + def test_no_downgrade_from_wal_to_delete_on_eio(self, tmp_path): + """OperationalError NOT in _WAL_INCOMPAT_MARKERS must propagate, not downgrade.""" + import sqlite3 + import pytest + from hermes_state import apply_wal_with_fallback + + class _EIOConn(sqlite3.Connection): + def __init__(self, *a, **kw): + super().__init__(*a, **kw) + self._first = True + + def execute(self, sql, params=()): + # Let the probe succeed (returns "delete" for fresh DB). + if "journal_mode=WAL" in sql: + raise sqlite3.OperationalError("some unexpected hardware failure") + return super().execute(sql, params) + + db_path = tmp_path / "eio.db" + conn = _EIOConn(str(db_path)) + try: + with pytest.raises( + sqlite3.OperationalError, match="some unexpected hardware failure" + ): + apply_wal_with_fallback(conn) + finally: + conn.close() + + def test_returns_wal_not_delete_from_probe(self, tmp_path): + """Early-return only on 'wal'; 'delete' or 'memory' must fall through to set-pragma.""" + import sqlite3 + from hermes_state import apply_wal_with_fallback + + class _TracingConn(sqlite3.Connection): + def __init__(self, *a, **kw): + super().__init__(*a, **kw) + self.executed = [] + + def execute(self, sql, params=()): + self.executed.append(sql) + return super().execute(sql, params) + + # Fresh DB is in "delete" mode — probe returns "delete", must NOT early-return. + db_path = tmp_path / "delete_mode.db" + conn = _TracingConn(str(db_path)) + try: + result = apply_wal_with_fallback(conn) + finally: + conn.close() + + assert result == "wal" + assert any("journal_mode=WAL" in sql for sql in conn.executed), ( + "set-pragma must fire when probe returns 'delete'" + ) diff --git a/tests/test_hermes_state_wal_fallback.py b/tests/test_hermes_state_wal_fallback.py index 9dfc718acb1..5678e3ff4f1 100644 --- a/tests/test_hermes_state_wal_fallback.py +++ b/tests/test_hermes_state_wal_fallback.py @@ -131,15 +131,17 @@ class TestApplyWalWithFallback: conn.close() def test_does_not_downgrade_when_disk_says_wal(self, tmp_path): - """Belt-and-suspenders: even if a marker matches, refuse to - downgrade when the on-disk DB header is already WAL. + """Refuse to downgrade an already-WAL DB even if the set-pragma path + would have raised a downgrade-eligible marker. - Prevents a future addition to ``_WAL_INCOMPAT_MARKERS`` from - accidentally reintroducing the mixed-journal-mode corruption - pattern. We construct a DB already in WAL on disk, then open a - new connection whose ``PRAGMA journal_mode=WAL`` raises one of - the legit markers — the function must still re-raise (refusing - the downgrade) because the on-disk file is WAL. + With the WAL-skip patch, the read-only probe short-circuits before + ``PRAGMA journal_mode=WAL`` ever runs on an already-WAL connection, + so the set-pragma path is unreachable here and ``attempts`` stays 0. + Either outcome (skip-via-probe OR re-raise-on-disk-check) preserves + the property this test guards: we never silently DELETE-downgrade + a WAL-mode file. The on-disk guard remains in place as + belt-and-suspenders for any future code path that bypasses the + probe. """ # Prime the file in WAL mode using a normal connection primer = sqlite3.connect( @@ -155,16 +157,21 @@ class TestApplyWalWithFallback: finally: primer.close() - # New connection whose WAL pragma raises "locking protocol" — a - # marker that WOULD normally trigger downgrade. With the on-disk - # guard, we must instead re-raise. - conn, _ = _open_blocking( + # New connection whose set-WAL pragma would raise "locking protocol" + # if it were ever called. With the WAL-skip patch the probe sees + # journal_mode=wal and returns early, so set-WAL is never attempted. + conn, attempts = _open_blocking( tmp_path / "already-wal.db", reason="locking protocol", isolation_level=None, ) - with pytest.raises(sqlite3.OperationalError, match="locking protocol"): - apply_wal_with_fallback(conn) + result = apply_wal_with_fallback(conn) + assert result == "wal", ( + "must report wal mode (either skipped via probe or refused downgrade)" + ) + assert attempts[0] == 0, ( + "set-WAL pragma must not run when the on-disk header already says wal" + ) conn.close() # And the file is STILL WAL on disk — nothing got rewritten