fix(kanban): harden SQLite against torn-write corruption (secure_delete + cell_size_check + synchronous=FULL)

Production corruption #6 left b-tree pages with zeroed headers but intact old cell content — the Bug E pattern. This fix applies three pragma calls on every connect():

- synchronous=FULL (was NORMAL): closes the WAL-checkpoint reordering window where a crash between WAL commit and main-DB write leaves a partially-written b-tree page header. Cost is <1ms per commit on local SSD; negligible at kanban write volume.

- secure_delete=ON: forces SQLite to zero freed page bytes on disk. If a torn write or hardware fault later corrupts a page, the underlying cell content is zero, so corruption is detectable and no stale rows can resurface as live data.

- cell_size_check=ON: adds a read-side guard so corrupt cells surface as errors at read time rather than as silent wrong-data returns.

All three are connection-scoped and re-applied on every connect(). secure_delete also writes a persistent flag into the DB header on the first call against a fresh DB, making the protection durable across processes for new DBs.

Tests added for all four required cases: each pragma active on a fresh connection, and all three re-applied after close+reopen. Also adds the required negative test (migration path does not reset pragmas).
This commit is contained in:
Stephen Chin 2026-05-23 15:45:03 -07:00 committed by kshitij
parent 963d22cde6
commit 6416dd5187
2 changed files with 73 additions and 1 deletions

View file

@ -1181,8 +1181,16 @@ def connect(
# See hermes_state._WAL_INCOMPAT_MARKERS for detection logic.
from hermes_state import apply_wal_with_fallback
apply_wal_with_fallback(conn, db_label=f"kanban.db ({path.name})")
conn.execute("PRAGMA synchronous=NORMAL")
# FULL (was NORMAL): fsync before each checkpoint to narrow the
# crash window that can leave a b-tree page header torn.
conn.execute("PRAGMA synchronous=FULL")
conn.execute("PRAGMA foreign_keys=ON")
# Zero freed pages so a later torn write cannot expose stale
# cell content; persisted in the DB header for new DBs.
conn.execute("PRAGMA secure_delete=ON")
# Surface corrupt cells as read errors instead of silent
# wrong-data returns.
conn.execute("PRAGMA cell_size_check=ON")
needs_init = resolved not in _INITIALIZED_PATHS
if needs_init:
# Idempotent: runs CREATE TABLE IF NOT EXISTS + the additive

View file

@ -3339,3 +3339,67 @@ def test_maybe_emit_scratch_tip_skips_non_scratch_workspaces(kanban_home, caplog
).fetchall()
assert "tip_scratch_workspace" not in [e["kind"] for e in events]
# ---------------------------------------------------------------------------
# Connection pragmas (secure_delete, cell_size_check, synchronous=FULL)
# ---------------------------------------------------------------------------
def test_connect_sets_secure_delete_on(tmp_path):
"""secure_delete=ON must be active on every new connection."""
db_path = tmp_path / "kanban.db"
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
with kb.connect(db_path=db_path) as conn:
row = conn.execute("PRAGMA secure_delete").fetchone()
assert row[0] == 1, f"expected secure_delete=1, got {row[0]}"
def test_connect_sets_cell_size_check_on(tmp_path):
"""cell_size_check=ON must be active on every new connection."""
db_path = tmp_path / "kanban.db"
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
with kb.connect(db_path=db_path) as conn:
row = conn.execute("PRAGMA cell_size_check").fetchone()
assert row[0] == 1, f"expected cell_size_check=1, got {row[0]}"
def test_connect_sets_synchronous_full(tmp_path):
"""synchronous must be FULL (=2), not NORMAL (=1)."""
db_path = tmp_path / "kanban.db"
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
with kb.connect(db_path=db_path) as conn:
row = conn.execute("PRAGMA synchronous").fetchone()
assert row[0] == 2, f"expected synchronous=2 (FULL), got {row[0]}"
def test_connect_pragmas_applied_on_reconnect(tmp_path):
"""All three pragmas must be re-applied on every connect(), not just the first."""
db_path = tmp_path / "kanban.db"
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
# First connection: write a task and close.
with kb.connect(db_path=db_path) as conn:
kb.create_task(conn, title="reconnect-check")
# Force re-init path by discarding path cache.
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
# Second connection: pragmas must still be applied.
with kb.connect(db_path=db_path) as conn:
assert conn.execute("PRAGMA secure_delete").fetchone()[0] == 1
assert conn.execute("PRAGMA cell_size_check").fetchone()[0] == 1
assert conn.execute("PRAGMA synchronous").fetchone()[0] == 2
def test_pragmas_not_accidentally_disabled_by_migrate_path(tmp_path):
"""Migration path must not reset connection pragmas."""
db_path = tmp_path / "legacy.db"
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
# Initialise with a fresh connect so schema + init run.
with kb.connect(db_path=db_path) as conn:
kb.create_task(conn, title="pre-migration-task")
# Simulate a re-entry through the init/migration path by discarding path cache.
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
with kb.connect(db_path=db_path) as conn:
assert conn.execute("PRAGMA secure_delete").fetchone()[0] == 1
assert conn.execute("PRAGMA cell_size_check").fetchone()[0] == 1
assert conn.execute("PRAGMA synchronous").fetchone()[0] == 2