mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(kanban): skip redundant WAL pragma on already-WAL connections
apply_wal_with_fallback() issued PRAGMA journal_mode=WAL on every call,
including connections to DBs already in WAL mode. This triggered the WAL
init code path, causing SQLite to acquire EXCLUSIVE, checkpoint, and unlink
kanban.db-{wal,shm}. Other open connections received (deleted) FDs and
raised sqlite3.OperationalError: disk I/O error.
Add a cheap read probe (PRAGMA journal_mode, no flock/checkpoint/unlink)
before the set-pragma path. If already wal, return early. The set-pragma
and DELETE fallback paths are unchanged.
Closes #31158. Addresses root cause that PRs #32226 and #32322 attempted
via connection-sharing/caching approaches.
This commit is contained in:
parent
ffdc937c18
commit
dc98314fbd
3 changed files with 250 additions and 14 deletions
|
|
@ -3021,3 +3021,223 @@ class TestFTS5ToolCallMigration:
|
|||
finally:
|
||||
session_db.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# apply_wal_with_fallback — read-only probe tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestApplyWalProbe:
|
||||
"""Unit tests for the journal_mode probe in apply_wal_with_fallback."""
|
||||
|
||||
def test_skips_set_pragma_when_already_wal(self, tmp_path):
|
||||
"""Already-WAL connection must not trigger the set-pragma."""
|
||||
import sqlite3
|
||||
from hermes_state import apply_wal_with_fallback
|
||||
|
||||
class _TracingConn(sqlite3.Connection):
|
||||
def __init__(self, *a, **kw):
|
||||
super().__init__(*a, **kw)
|
||||
self.executed = []
|
||||
|
||||
def execute(self, sql, params=()):
|
||||
self.executed.append(sql)
|
||||
return super().execute(sql, params)
|
||||
|
||||
db_path = tmp_path / "wal.db"
|
||||
# Prime the file into WAL mode first.
|
||||
with sqlite3.connect(str(db_path)) as seed:
|
||||
seed.execute("PRAGMA journal_mode=WAL")
|
||||
|
||||
conn = _TracingConn(str(db_path))
|
||||
try:
|
||||
result = apply_wal_with_fallback(conn)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
assert result == "wal"
|
||||
# Only the probe should have fired; the set-pragma must NOT appear.
|
||||
assert any("PRAGMA journal_mode" == sql.strip() for sql in conn.executed), (
|
||||
"probe PRAGMA should have run"
|
||||
)
|
||||
assert not any("journal_mode=WAL" in sql for sql in conn.executed), (
|
||||
"set-pragma must not run when already in WAL mode"
|
||||
)
|
||||
|
||||
def test_sets_wal_on_fresh_connection(self, tmp_path):
|
||||
"""Probe sees 'delete', then set-pragma runs and returns 'wal'."""
|
||||
import sqlite3
|
||||
from hermes_state import apply_wal_with_fallback
|
||||
|
||||
class _TracingConn(sqlite3.Connection):
|
||||
def __init__(self, *a, **kw):
|
||||
super().__init__(*a, **kw)
|
||||
self.executed = []
|
||||
|
||||
def execute(self, sql, params=()):
|
||||
self.executed.append(sql)
|
||||
return super().execute(sql, params)
|
||||
|
||||
db_path = tmp_path / "fresh.db"
|
||||
conn = _TracingConn(str(db_path))
|
||||
try:
|
||||
result = apply_wal_with_fallback(conn)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
assert result == "wal"
|
||||
assert any("journal_mode=WAL" in sql for sql in conn.executed), (
|
||||
"set-pragma must fire on a fresh (non-WAL) connection"
|
||||
)
|
||||
|
||||
def test_apply_wal_concurrent_connects_no_eio(self, tmp_path):
|
||||
"""20 threads calling connect() on the same DB must not see disk I/O error."""
|
||||
import sys
|
||||
import threading
|
||||
import sqlite3
|
||||
from hermes_state import apply_wal_with_fallback
|
||||
|
||||
db_path = tmp_path / "concurrent.db"
|
||||
errors = []
|
||||
|
||||
def _connect_cycle():
|
||||
for _ in range(5):
|
||||
try:
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
apply_wal_with_fallback(conn)
|
||||
conn.close()
|
||||
except sqlite3.OperationalError as exc:
|
||||
if "disk i/o error" in str(exc).lower():
|
||||
errors.append(exc)
|
||||
|
||||
threads = [threading.Thread(target=_connect_cycle) for _ in range(20)]
|
||||
for t in threads:
|
||||
t.start()
|
||||
for t in threads:
|
||||
t.join()
|
||||
|
||||
assert not errors, f"disk I/O errors from concurrent connects: {errors}"
|
||||
|
||||
# Linux-only: no (deleted) WAL/SHM FDs should accumulate.
|
||||
if sys.platform == "linux":
|
||||
import os
|
||||
|
||||
fd_dir = f"/proc/{os.getpid()}/fd"
|
||||
deleted_fds = []
|
||||
for fd_name in os.listdir(fd_dir):
|
||||
try:
|
||||
target = os.readlink(os.path.join(fd_dir, fd_name))
|
||||
if "(deleted)" in target and (
|
||||
"wal" in target.lower() or "shm" in target.lower()
|
||||
):
|
||||
deleted_fds.append(target)
|
||||
except OSError:
|
||||
pass
|
||||
assert not deleted_fds, f"stale deleted WAL/SHM FDs: {deleted_fds}"
|
||||
|
||||
def test_fallback_to_delete_still_works(self, tmp_path):
|
||||
"""When set-pragma raises a WAL-incompat error, falls back to DELETE."""
|
||||
import sqlite3
|
||||
from hermes_state import apply_wal_with_fallback
|
||||
|
||||
class _IncompatConn(sqlite3.Connection):
|
||||
def __init__(self, *a, **kw):
|
||||
super().__init__(*a, **kw)
|
||||
self._call_count = 0
|
||||
|
||||
def execute(self, sql, params=()):
|
||||
self._call_count += 1
|
||||
# First call is the read probe; let it return "delete".
|
||||
# Second call is the set-pragma; raise a WAL-incompat error.
|
||||
if "journal_mode=WAL" in sql:
|
||||
raise sqlite3.OperationalError("locking protocol")
|
||||
return super().execute(sql, params)
|
||||
|
||||
db_path = tmp_path / "incompat.db"
|
||||
conn = _IncompatConn(str(db_path))
|
||||
try:
|
||||
result = apply_wal_with_fallback(conn, db_label="test.db")
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
assert result == "delete"
|
||||
|
||||
def test_probe_failure_falls_through_to_set_pragma(self, tmp_path):
|
||||
"""When the read probe raises OperationalError, fall through to set-pragma."""
|
||||
import sqlite3
|
||||
from hermes_state import apply_wal_with_fallback
|
||||
|
||||
class _ProbeFails(sqlite3.Connection):
|
||||
def __init__(self, *a, **kw):
|
||||
super().__init__(*a, **kw)
|
||||
self._first = True
|
||||
|
||||
def execute(self, sql, params=()):
|
||||
if self._first and "journal_mode" in sql and "WAL" not in sql:
|
||||
self._first = False
|
||||
raise sqlite3.OperationalError("simulated probe failure")
|
||||
return super().execute(sql, params)
|
||||
|
||||
db_path = tmp_path / "probe_fail.db"
|
||||
conn = _ProbeFails(str(db_path))
|
||||
try:
|
||||
result = apply_wal_with_fallback(conn)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
# Despite probe failure, set-pragma must still run and succeed.
|
||||
assert result == "wal"
|
||||
|
||||
def test_no_downgrade_from_wal_to_delete_on_eio(self, tmp_path):
|
||||
"""OperationalError NOT in _WAL_INCOMPAT_MARKERS must propagate, not downgrade."""
|
||||
import sqlite3
|
||||
import pytest
|
||||
from hermes_state import apply_wal_with_fallback
|
||||
|
||||
class _EIOConn(sqlite3.Connection):
|
||||
def __init__(self, *a, **kw):
|
||||
super().__init__(*a, **kw)
|
||||
self._first = True
|
||||
|
||||
def execute(self, sql, params=()):
|
||||
# Let the probe succeed (returns "delete" for fresh DB).
|
||||
if "journal_mode=WAL" in sql:
|
||||
raise sqlite3.OperationalError("some unexpected hardware failure")
|
||||
return super().execute(sql, params)
|
||||
|
||||
db_path = tmp_path / "eio.db"
|
||||
conn = _EIOConn(str(db_path))
|
||||
try:
|
||||
with pytest.raises(
|
||||
sqlite3.OperationalError, match="some unexpected hardware failure"
|
||||
):
|
||||
apply_wal_with_fallback(conn)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
def test_returns_wal_not_delete_from_probe(self, tmp_path):
|
||||
"""Early-return only on 'wal'; 'delete' or 'memory' must fall through to set-pragma."""
|
||||
import sqlite3
|
||||
from hermes_state import apply_wal_with_fallback
|
||||
|
||||
class _TracingConn(sqlite3.Connection):
|
||||
def __init__(self, *a, **kw):
|
||||
super().__init__(*a, **kw)
|
||||
self.executed = []
|
||||
|
||||
def execute(self, sql, params=()):
|
||||
self.executed.append(sql)
|
||||
return super().execute(sql, params)
|
||||
|
||||
# Fresh DB is in "delete" mode — probe returns "delete", must NOT early-return.
|
||||
db_path = tmp_path / "delete_mode.db"
|
||||
conn = _TracingConn(str(db_path))
|
||||
try:
|
||||
result = apply_wal_with_fallback(conn)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
assert result == "wal"
|
||||
assert any("journal_mode=WAL" in sql for sql in conn.executed), (
|
||||
"set-pragma must fire when probe returns 'delete'"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -131,15 +131,17 @@ class TestApplyWalWithFallback:
|
|||
conn.close()
|
||||
|
||||
def test_does_not_downgrade_when_disk_says_wal(self, tmp_path):
|
||||
"""Belt-and-suspenders: even if a marker matches, refuse to
|
||||
downgrade when the on-disk DB header is already WAL.
|
||||
"""Refuse to downgrade an already-WAL DB even if the set-pragma path
|
||||
would have raised a downgrade-eligible marker.
|
||||
|
||||
Prevents a future addition to ``_WAL_INCOMPAT_MARKERS`` from
|
||||
accidentally reintroducing the mixed-journal-mode corruption
|
||||
pattern. We construct a DB already in WAL on disk, then open a
|
||||
new connection whose ``PRAGMA journal_mode=WAL`` raises one of
|
||||
the legit markers — the function must still re-raise (refusing
|
||||
the downgrade) because the on-disk file is WAL.
|
||||
With the WAL-skip patch, the read-only probe short-circuits before
|
||||
``PRAGMA journal_mode=WAL`` ever runs on an already-WAL connection,
|
||||
so the set-pragma path is unreachable here and ``attempts`` stays 0.
|
||||
Either outcome (skip-via-probe OR re-raise-on-disk-check) preserves
|
||||
the property this test guards: we never silently DELETE-downgrade
|
||||
a WAL-mode file. The on-disk guard remains in place as
|
||||
belt-and-suspenders for any future code path that bypasses the
|
||||
probe.
|
||||
"""
|
||||
# Prime the file in WAL mode using a normal connection
|
||||
primer = sqlite3.connect(
|
||||
|
|
@ -155,16 +157,21 @@ class TestApplyWalWithFallback:
|
|||
finally:
|
||||
primer.close()
|
||||
|
||||
# New connection whose WAL pragma raises "locking protocol" — a
|
||||
# marker that WOULD normally trigger downgrade. With the on-disk
|
||||
# guard, we must instead re-raise.
|
||||
conn, _ = _open_blocking(
|
||||
# New connection whose set-WAL pragma would raise "locking protocol"
|
||||
# if it were ever called. With the WAL-skip patch the probe sees
|
||||
# journal_mode=wal and returns early, so set-WAL is never attempted.
|
||||
conn, attempts = _open_blocking(
|
||||
tmp_path / "already-wal.db",
|
||||
reason="locking protocol",
|
||||
isolation_level=None,
|
||||
)
|
||||
with pytest.raises(sqlite3.OperationalError, match="locking protocol"):
|
||||
apply_wal_with_fallback(conn)
|
||||
result = apply_wal_with_fallback(conn)
|
||||
assert result == "wal", (
|
||||
"must report wal mode (either skipped via probe or refused downgrade)"
|
||||
)
|
||||
assert attempts[0] == 0, (
|
||||
"set-WAL pragma must not run when the on-disk header already says wal"
|
||||
)
|
||||
conn.close()
|
||||
|
||||
# And the file is STILL WAL on disk — nothing got rewritten
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue