fix(kanban): preserve original exception when write_txn rollback fails

When code inside a write_txn block raises an OperationalError that SQLite
has already auto-rolled-back (typical for disk I/O error,
database is locked, and database disk image is malformed), the
explicit ROLLBACK in write_txn.__exit__ itself raises
cannot rollback - no transaction is active and the secondary exception
replaces the original in the traceback. Operators see a misleading error
and lose the diagnostic information they need.

Swallow the rollback-time OperationalError so the caller always sees the
original cause.

Confirmed reproducer: tests/hermes_cli/test_kanban_db.py::
test_write_txn_preserves_original_exception_when_rollback_fails
This commit is contained in:
Stephen Chin 2026-05-23 21:56:07 -07:00 committed by kshitij
parent 5c49cd0ed0
commit e83252dc46
2 changed files with 68 additions and 1 deletions

View file

@ -1481,12 +1481,22 @@ def write_txn(conn: sqlite3.Connection):
Use for any multi-statement write (creating a task + link, claiming a
task + recording an event, etc.). A claim CAS inside this context is
atomic -- at most one concurrent writer can succeed.
The explicit ROLLBACK on exception is wrapped in try/except so that
a SQLite auto-rollback (which leaves no active transaction) does not
shadow the original exception with a spurious rollback error.
"""
conn.execute("BEGIN IMMEDIATE")
try:
yield conn
except Exception:
conn.execute("ROLLBACK")
try:
conn.execute("ROLLBACK")
except sqlite3.OperationalError:
# SQLite has already auto-rolled-back the transaction (typical
# under EIO, lock contention, or corruption). Nothing to undo;
# do not let this secondary failure shadow the real one.
pass
raise
else:
conn.execute("COMMIT")

View file

@ -3417,3 +3417,60 @@ def test_pragmas_not_accidentally_disabled_by_migrate_path(tmp_path):
assert conn.execute("PRAGMA cell_size_check").fetchone()[0] == 1
assert conn.execute("PRAGMA synchronous").fetchone()[0] == 2
# write_txn — rollback handler must not mask the original exception
# ---------------------------------------------------------------------------
def test_write_txn_preserves_original_exception_when_rollback_fails(kanban_home):
"""When a write inside write_txn raises an OperationalError that SQLite
has already auto-rolled-back (e.g. ``disk I/O error``,
``database is locked``, ``database disk image is malformed``), the
explicit ROLLBACK in ``write_txn.__exit__`` itself raises
``cannot rollback - no transaction is active``. The original cause
must NOT be masked by the secondary rollback failure operators rely
on the original cause to diagnose the underlying issue.
"""
class FailingConnWrapper:
"""Delegate to a real connection, simulating an EIO during an INSERT
that SQLite has already auto-rolled-back."""
def __init__(self, real):
self._real = real
self._fail_armed = True
def execute(self, sql, *args, **kwargs):
if (
self._fail_armed
and sql.lstrip().upper().startswith("INSERT")
and "task_events" in sql.lower()
):
self._fail_armed = False # one-shot
# Simulate SQLite auto-rolling back the transaction by
# issuing a real ROLLBACK now. After this, BEGIN IMMEDIATE
# is no longer active and an explicit ROLLBACK would error.
try:
self._real.execute("ROLLBACK")
except sqlite3.OperationalError:
pass
raise sqlite3.OperationalError("disk I/O error")
return self._real.execute(sql, *args, **kwargs)
def __getattr__(self, name):
return getattr(self._real, name)
with kb.connect() as conn:
wrapper = FailingConnWrapper(conn)
with pytest.raises(sqlite3.OperationalError) as excinfo:
with kb.write_txn(wrapper):
kb._append_event(wrapper, "t_bogus", "promoted", None)
msg = str(excinfo.value)
assert "disk I/O error" in msg, (
f"write_txn masked the original exception with rollback failure; "
f"got {msg!r} (expected to contain 'disk I/O error')"
)
assert "cannot rollback" not in msg, (
f"write_txn surfaced the rollback failure instead of the original "
f"OperationalError; got {msg!r}"
)