From 39fe4ecee3d0876ca16a4a031349140733264302 Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 22 May 2026 22:22:27 -0400 Subject: [PATCH] fix(kanban): refuse corrupt db auto-init --- hermes_cli/kanban_db.py | 99 ++++++++++++++++++++++++++++ tests/hermes_cli/test_kanban_db.py | 101 +++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index 7a30b70987f..c2447bdcb21 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -75,6 +75,7 @@ import json import os import re import secrets +import shutil import sqlite3 import subprocess import sys @@ -82,6 +83,7 @@ import threading import logging import time from dataclasses import dataclass, field +from datetime import datetime from pathlib import Path from typing import Any, Iterable, Optional @@ -1005,6 +1007,97 @@ def _validate_sqlite_header(path: Path) -> None: ) +class KanbanDbCorruptError(RuntimeError): + """Raised when an existing kanban DB file fails integrity checks. + + Fail-closed guard against silent recreation of a corrupt board file, + which would otherwise destroy the user's tasks. Carries both the + original path and the timestamped backup we made before refusing. + """ + + def __init__(self, db_path: Path, backup_path: Optional[Path], reason: str): + self.db_path = db_path + self.backup_path = backup_path + self.reason = reason + backup_str = str(backup_path) if backup_path is not None else "" + super().__init__( + f"Refusing to open corrupt kanban DB at {db_path}: {reason}. " + f"Original preserved; backup at {backup_str}." + ) + + +def _backup_corrupt_db(path: Path) -> Optional[Path]: + """Copy a corrupt DB (and its WAL/SHM sidecars) to a timestamped backup. + + Returns the backup path of the main DB file, or ``None`` if the copy + itself failed (the caller still raises loudly in that case). + """ + stamp = datetime.now().strftime("%Y%m%d_%H%M%S") + candidate = path.with_name(f"{path.name}.corrupt.{stamp}.bak") + counter = 0 + while candidate.exists(): + counter += 1 + candidate = path.with_name( + f"{path.name}.corrupt.{stamp}.{counter}.bak" + ) + try: + shutil.copy2(path, candidate) + except OSError: + return None + for suffix in ("-wal", "-shm"): + sidecar = path.with_name(path.name + suffix) + if sidecar.exists(): + try: + shutil.copy2(sidecar, candidate.with_name(candidate.name + suffix)) + except OSError: + pass + return candidate + + +def _guard_existing_db_is_healthy(path: Path) -> None: + """Run ``PRAGMA integrity_check`` on an existing non-empty DB file. + + Opens the probe in read/write mode so SQLite can recover or + checkpoint a healthy WAL/hot-journal DB before we declare it + corrupt. If the file is malformed, copy it (and any WAL/SHM + sidecars) to a timestamped backup and raise + :class:`KanbanDbCorruptError` so callers cannot silently recreate + the schema on top of a damaged DB. + + Transient lock/busy errors (``sqlite3.OperationalError``) are NOT + treated as corruption; they propagate raw so the caller sees a + normal lock failure and no spurious ``.corrupt`` backup is made. + + No-op for missing files, zero-byte files (treated as fresh), and + paths already proven healthy this process (cache hit). + """ + try: + if not path.exists() or path.stat().st_size == 0: + return + except OSError: + return + if str(path.resolve()) in _INITIALIZED_PATHS: + return + reason: Optional[str] = None + try: + probe = sqlite3.connect(str(path), timeout=5, isolation_level=None) + try: + row = probe.execute("PRAGMA integrity_check").fetchone() + finally: + probe.close() + if not row or (row[0] or "").lower() != "ok": + reason = f"integrity_check returned {row[0] if row else ''!r}" + except sqlite3.OperationalError: + # Lock contention, busy, transient IO — not corruption. Let it propagate. + raise + except sqlite3.DatabaseError as exc: + reason = f"sqlite refused to open file: {exc}" + if reason is None: + return + backup = _backup_corrupt_db(path) + raise KanbanDbCorruptError(path, backup, reason) + + def connect( db_path: Optional[Path] = None, *, @@ -1033,7 +1126,13 @@ def connect( else: path = kanban_db_path(board=board) path.parent.mkdir(parents=True, exist_ok=True) + # Cheap byte-level check first — catches the #29507 TLS-overwrite shape + # and other invalid-header cases without opening a sqlite connection. _validate_sqlite_header(path) + # Full integrity probe — catches corruption past the header (malformed + # pages, broken internal metadata). Cached per-path after first success + # via _INITIALIZED_PATHS so it only runs once per process per path. + _guard_existing_db_is_healthy(path) resolved = str(path.resolve()) conn = sqlite3.connect(str(path), isolation_level=None, timeout=30) try: diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py index 435ef41001a..24b553e9e65 100644 --- a/tests/hermes_cli/test_kanban_db.py +++ b/tests/hermes_cli/test_kanban_db.py @@ -2981,3 +2981,104 @@ def test_detect_stale_does_not_tick_failure_counter(kanban_home, monkeypatch): assert "stale" in kinds, ( f"Expected 'stale' event in task_events; got {kinds!r}" ) + + +# --------------------------------------------------------------------------- +# Corruption guard (issue #30687) +# --------------------------------------------------------------------------- + +def _write_corrupt_db(path: Path) -> bytes: + """Write a kanban DB with a VALID SQLite header but malformed page content. + + This is the corruption shape the integrity guard specifically targets + (e.g. issue #29507 follow-up reports where the file's first 16 bytes + pass the header byte check but ``PRAGMA integrity_check`` then fails + because the internal pages are damaged). It's what main's header-only + validator was letting through, and what this PR adds the full guard + for. + """ + # 100-byte SQLite header (magic + minimal valid-looking fields) so the + # cheap header check passes, then deliberate garbage so sqlite refuses + # to read the file past the header. + header = b"SQLite format 3\x00" + b"\x10\x00\x02\x02\x00\x40\x20\x20" + header += b"\x00\x00\x00\x0c\x00\x00\x23\x46\x00\x00\x00\x00" + header = header.ljust(100, b"\x00") + payload = b"definitely not a valid sqlite page \x00\x01\x02\x03" * 64 + blob = header + payload + path.write_bytes(blob) + return blob + + +def test_init_db_refuses_corrupt_existing_file(tmp_path): + db_path = tmp_path / "kanban.db" + original = _write_corrupt_db(db_path) + # Ensure the cache doesn't mask the guard. + kb._INITIALIZED_PATHS.discard(str(db_path.resolve())) + + with pytest.raises(kb.KanbanDbCorruptError) as excinfo: + kb.init_db(db_path=db_path) + + err = excinfo.value + assert err.db_path == db_path + assert err.backup_path is not None + assert err.backup_path.exists() + assert err.backup_path.read_bytes() == original + # Original bytes untouched — no schema was written on top. + assert db_path.read_bytes() == original + assert str(db_path) in str(err) + assert str(err.backup_path) in str(err) + + +def test_connect_refuses_corrupt_existing_file(tmp_path): + db_path = tmp_path / "kanban.db" + _write_corrupt_db(db_path) + kb._INITIALIZED_PATHS.discard(str(db_path.resolve())) + + with pytest.raises(kb.KanbanDbCorruptError): + kb.connect(db_path=db_path) + + +def test_locked_healthy_db_does_not_classify_as_corrupt(tmp_path, monkeypatch): + """A transient lock during the probe must not produce a .corrupt backup + and must not be reported as :class:`KanbanDbCorruptError`. Raw sqlite + ``OperationalError`` (lock/busy) is acceptable and expected.""" + db_path = tmp_path / "kanban.db" + kb.init_db(db_path=db_path) + kb._INITIALIZED_PATHS.discard(str(db_path.resolve())) + + real_connect = sqlite3.connect + + def flaky_connect(*args, **kwargs): + # First call is the integrity probe — simulate a lock. + raise sqlite3.OperationalError("database is locked") + + monkeypatch.setattr(kb.sqlite3, "connect", flaky_connect) + + with pytest.raises(sqlite3.OperationalError): + kb.connect(db_path=db_path) + + # No .corrupt backup may be produced for a healthy-but-locked DB. + backups = list(tmp_path.glob("*.corrupt.*")) + assert backups == [], f"unexpected corrupt backups: {backups}" + + # And once the lock clears, normal access still works. + monkeypatch.setattr(kb.sqlite3, "connect", real_connect) + with kb.connect(db_path=db_path) as conn: + kb.create_task(conn, title="still here") + titles = [t.title for t in kb.list_tasks(conn)] + assert "still here" in titles + + +def test_init_db_allows_missing_then_healthy(tmp_path): + db_path = tmp_path / "fresh.db" + assert not db_path.exists() + kb.init_db(db_path=db_path) + assert db_path.exists() and db_path.stat().st_size > 0 + + # Idempotent on a healthy DB: data survives a second init. + with kb.connect(db_path=db_path) as conn: + kb.create_task(conn, title="keeps") + kb.init_db(db_path=db_path) + with kb.connect(db_path=db_path) as conn: + tasks = kb.list_tasks(conn) + assert [t.title for t in tasks] == ["keeps"]