From 6f9182cb34fe2569d0006584bb3fd4cf5199bb4f Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Thu, 28 May 2026 03:22:57 -0700 Subject: [PATCH] fix(kanban): content-addressed corrupt-DB backup filename MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Repeated quarantines of an unchanged corrupt kanban.db used to amplify disk usage by N: the gateway dispatcher's 5-minute retry loop, multi- profile fleets sharing one DB, and manual reopen attempts each produced a fresh '.corrupt..bak' copy of the same bytes. After 10 retries on a 100KB DB you had 11x the disk footprint of duplicate corrupt data. Derive the backup filename from a sha256 of the main DB instead of a timestamp + collision counter. Same bytes → same filename → skip the copy on retries. Different bytes (partial repair, further damage) → different filename → preserve separately. Sidecar (-wal/-shm) backups inherit the same content-addressed name. Inspired by @hanzckernel's PR #33529, simplified down to ~30 LOC: drop the persistent JSON marker file, drop the atomic temp+fsync+rename helper (shutil.copy2 is fine for a quarantine-only path), drop the gateway-side WAL/SHM fingerprint extension (the existing (path, mtime, size) tuple still gives the 5-minute retry semantics it needs), and drop the gateway-side helper extraction. The backup file existing IS the marker; no separate state needed. Test: tests/hermes_cli/test_kanban_db.py::test_repeated_corrupt_open_reuses_single_backup proves 10 retries on the same corrupt bytes produce 1 backup (was 11), and mutating the corrupt bytes produces a second backup with a different fingerprint. Refs #33529 Co-authored-by: hanzckernel --- hermes_cli/kanban_db.py | 51 +++++++++++++++++------------- tests/hermes_cli/test_kanban_db.py | 38 ++++++++++++++++++++++ 2 files changed, 67 insertions(+), 22 deletions(-) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index 633b952ab45..cbe7f03a59e 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -71,6 +71,7 @@ new locking. from __future__ import annotations import contextlib +import hashlib import json import os import re @@ -1138,14 +1139,21 @@ class KanbanDbCorruptError(RuntimeError): def _backup_corrupt_db(path: Path) -> Optional[Path]: - """Copy a corrupt DB (and its WAL/SHM sidecars) to a timestamped backup. + """Copy a corrupt DB (and its WAL/SHM sidecars) to a content-addressed backup. + + The backup filename is deterministic in the main DB's sha256, so repeated + quarantines of the same corrupt bytes (gateway restarts, dispatcher retries, + multi-profile fleets all hitting the same shared DB) reuse one backup + instead of amplifying disk usage by N. If the corrupt bytes actually + change between attempts — e.g. a partial repair or further damage — the + fingerprint changes and a separate backup is preserved. Returns the backup path of the main DB file, or ``None`` if the copy itself failed (the caller still raises loudly in that case). - Writes are confined to the original DB's parent directory. The - backup basename is derived purely from ``path.name``, never from - caller-supplied directory segments — no traversal is possible. + Writes are confined to the original DB's parent directory. The backup + basename is derived purely from ``path.name`` and a content hash, never + from caller-supplied directory segments — no traversal is possible. """ # Resolve once and pin the parent so subsequent path operations cannot # escape it. ``Path.resolve()`` collapses any ``..`` segments and @@ -1153,32 +1161,31 @@ def _backup_corrupt_db(path: Path) -> Optional[Path]: resolved = path.resolve() parent = resolved.parent base_name = resolved.name # basename only - stamp = datetime.now().strftime("%Y%m%d_%H%M%S") - candidate = parent / f"{base_name}.corrupt.{stamp}.bak" - # Defensive: candidate must still be inside parent after construction. - # f-string interpolation of ``base_name`` cannot escape ``parent`` - # because ``base_name`` is itself a resolved basename, but assert it - # anyway so static analyzers can see the containment guarantee. - if candidate.parent != parent: - return None - counter = 0 - while candidate.exists(): - counter += 1 - candidate = parent / f"{base_name}.corrupt.{stamp}.{counter}.bak" - if candidate.parent != parent: - return None + digest = hashlib.sha256() try: - shutil.copy2(resolved, candidate) + with resolved.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) except OSError: return None + token = digest.hexdigest()[:16] + candidate = parent / f"{base_name}.corrupt.{token}.bak" + # Defensive: candidate must still be inside parent after construction. + if candidate.parent != parent: + return None + if not candidate.exists(): + try: + shutil.copy2(resolved, candidate) + except OSError: + return None for suffix in ("-wal", "-shm"): sidecar = parent / (base_name + suffix) if sidecar.parent != parent or not sidecar.exists(): continue + sidecar_backup = parent / (candidate.name + suffix) + if sidecar_backup.parent != parent or sidecar_backup.exists(): + continue try: - sidecar_backup = parent / (candidate.name + suffix) - if sidecar_backup.parent != parent: - continue shutil.copy2(sidecar, sidecar_backup) except OSError: pass diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py index 9e80fa1c96e..69049b20938 100644 --- a/tests/hermes_cli/test_kanban_db.py +++ b/tests/hermes_cli/test_kanban_db.py @@ -3317,6 +3317,44 @@ def test_connect_refuses_corrupt_existing_file(tmp_path): kb.connect(db_path=db_path) +def test_repeated_corrupt_open_reuses_single_backup(tmp_path): + """Repeated quarantines of the same corrupt bytes must not amplify disk usage. + + Regression for the gateway dispatcher's 5-min retry loop on shared kanban + DBs across multi-profile fleets: each retry on an unchanged corrupt file + used to create a fresh ``.corrupt..bak`` until disk filled. The + content-addressed backup name is deterministic in the DB's sha256, so + N retries of the same bytes share one backup. + """ + db_path = tmp_path / "kanban.db" + original = _write_corrupt_db(db_path) + + backups: set[Path] = set() + for _ in range(10): + kb._INITIALIZED_PATHS.discard(str(db_path.resolve())) + with pytest.raises(kb.KanbanDbCorruptError) as excinfo: + kb.connect(db_path=db_path) + assert excinfo.value.backup_path is not None + backups.add(excinfo.value.backup_path) + + assert len(backups) == 1, f"expected 1 deterministic backup, got {len(backups)}" + (backup,) = backups + assert backup.exists() + assert backup.read_bytes() == original + + # Mutate the corrupt bytes — fingerprint changes, separate backup preserved. + with db_path.open("r+b") as f: + f.seek(4096) + f.write(b"\xAB" * 64) + kb._INITIALIZED_PATHS.discard(str(db_path.resolve())) + with pytest.raises(kb.KanbanDbCorruptError) as excinfo2: + kb.connect(db_path=db_path) + second_backup = excinfo2.value.backup_path + assert second_backup is not None + assert second_backup != backup + assert second_backup.exists() + + def test_locked_healthy_db_does_not_classify_as_corrupt(tmp_path, monkeypatch): """A transient lock during the probe must not produce a .corrupt backup and must not be reported as :class:`KanbanDbCorruptError`. Raw sqlite