mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(kanban): refuse corrupt db auto-init
This commit is contained in:
parent
e97a4c8f37
commit
39fe4ecee3
2 changed files with 200 additions and 0 deletions
|
|
@ -75,6 +75,7 @@ import json
|
|||
import os
|
||||
import re
|
||||
import secrets
|
||||
import shutil
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
|
|
@ -82,6 +83,7 @@ import threading
|
|||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, Optional
|
||||
|
||||
|
|
@ -1005,6 +1007,97 @@ def _validate_sqlite_header(path: Path) -> None:
|
|||
)
|
||||
|
||||
|
||||
class KanbanDbCorruptError(RuntimeError):
|
||||
"""Raised when an existing kanban DB file fails integrity checks.
|
||||
|
||||
Fail-closed guard against silent recreation of a corrupt board file,
|
||||
which would otherwise destroy the user's tasks. Carries both the
|
||||
original path and the timestamped backup we made before refusing.
|
||||
"""
|
||||
|
||||
def __init__(self, db_path: Path, backup_path: Optional[Path], reason: str):
|
||||
self.db_path = db_path
|
||||
self.backup_path = backup_path
|
||||
self.reason = reason
|
||||
backup_str = str(backup_path) if backup_path is not None else "<backup failed>"
|
||||
super().__init__(
|
||||
f"Refusing to open corrupt kanban DB at {db_path}: {reason}. "
|
||||
f"Original preserved; backup at {backup_str}."
|
||||
)
|
||||
|
||||
|
||||
def _backup_corrupt_db(path: Path) -> Optional[Path]:
|
||||
"""Copy a corrupt DB (and its WAL/SHM sidecars) to a timestamped backup.
|
||||
|
||||
Returns the backup path of the main DB file, or ``None`` if the copy
|
||||
itself failed (the caller still raises loudly in that case).
|
||||
"""
|
||||
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
candidate = path.with_name(f"{path.name}.corrupt.{stamp}.bak")
|
||||
counter = 0
|
||||
while candidate.exists():
|
||||
counter += 1
|
||||
candidate = path.with_name(
|
||||
f"{path.name}.corrupt.{stamp}.{counter}.bak"
|
||||
)
|
||||
try:
|
||||
shutil.copy2(path, candidate)
|
||||
except OSError:
|
||||
return None
|
||||
for suffix in ("-wal", "-shm"):
|
||||
sidecar = path.with_name(path.name + suffix)
|
||||
if sidecar.exists():
|
||||
try:
|
||||
shutil.copy2(sidecar, candidate.with_name(candidate.name + suffix))
|
||||
except OSError:
|
||||
pass
|
||||
return candidate
|
||||
|
||||
|
||||
def _guard_existing_db_is_healthy(path: Path) -> None:
|
||||
"""Run ``PRAGMA integrity_check`` on an existing non-empty DB file.
|
||||
|
||||
Opens the probe in read/write mode so SQLite can recover or
|
||||
checkpoint a healthy WAL/hot-journal DB before we declare it
|
||||
corrupt. If the file is malformed, copy it (and any WAL/SHM
|
||||
sidecars) to a timestamped backup and raise
|
||||
:class:`KanbanDbCorruptError` so callers cannot silently recreate
|
||||
the schema on top of a damaged DB.
|
||||
|
||||
Transient lock/busy errors (``sqlite3.OperationalError``) are NOT
|
||||
treated as corruption; they propagate raw so the caller sees a
|
||||
normal lock failure and no spurious ``.corrupt`` backup is made.
|
||||
|
||||
No-op for missing files, zero-byte files (treated as fresh), and
|
||||
paths already proven healthy this process (cache hit).
|
||||
"""
|
||||
try:
|
||||
if not path.exists() or path.stat().st_size == 0:
|
||||
return
|
||||
except OSError:
|
||||
return
|
||||
if str(path.resolve()) in _INITIALIZED_PATHS:
|
||||
return
|
||||
reason: Optional[str] = None
|
||||
try:
|
||||
probe = sqlite3.connect(str(path), timeout=5, isolation_level=None)
|
||||
try:
|
||||
row = probe.execute("PRAGMA integrity_check").fetchone()
|
||||
finally:
|
||||
probe.close()
|
||||
if not row or (row[0] or "").lower() != "ok":
|
||||
reason = f"integrity_check returned {row[0] if row else '<no row>'!r}"
|
||||
except sqlite3.OperationalError:
|
||||
# Lock contention, busy, transient IO — not corruption. Let it propagate.
|
||||
raise
|
||||
except sqlite3.DatabaseError as exc:
|
||||
reason = f"sqlite refused to open file: {exc}"
|
||||
if reason is None:
|
||||
return
|
||||
backup = _backup_corrupt_db(path)
|
||||
raise KanbanDbCorruptError(path, backup, reason)
|
||||
|
||||
|
||||
def connect(
|
||||
db_path: Optional[Path] = None,
|
||||
*,
|
||||
|
|
@ -1033,7 +1126,13 @@ def connect(
|
|||
else:
|
||||
path = kanban_db_path(board=board)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
# Cheap byte-level check first — catches the #29507 TLS-overwrite shape
|
||||
# and other invalid-header cases without opening a sqlite connection.
|
||||
_validate_sqlite_header(path)
|
||||
# Full integrity probe — catches corruption past the header (malformed
|
||||
# pages, broken internal metadata). Cached per-path after first success
|
||||
# via _INITIALIZED_PATHS so it only runs once per process per path.
|
||||
_guard_existing_db_is_healthy(path)
|
||||
resolved = str(path.resolve())
|
||||
conn = sqlite3.connect(str(path), isolation_level=None, timeout=30)
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -2981,3 +2981,104 @@ def test_detect_stale_does_not_tick_failure_counter(kanban_home, monkeypatch):
|
|||
assert "stale" in kinds, (
|
||||
f"Expected 'stale' event in task_events; got {kinds!r}"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Corruption guard (issue #30687)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _write_corrupt_db(path: Path) -> bytes:
|
||||
"""Write a kanban DB with a VALID SQLite header but malformed page content.
|
||||
|
||||
This is the corruption shape the integrity guard specifically targets
|
||||
(e.g. issue #29507 follow-up reports where the file's first 16 bytes
|
||||
pass the header byte check but ``PRAGMA integrity_check`` then fails
|
||||
because the internal pages are damaged). It's what main's header-only
|
||||
validator was letting through, and what this PR adds the full guard
|
||||
for.
|
||||
"""
|
||||
# 100-byte SQLite header (magic + minimal valid-looking fields) so the
|
||||
# cheap header check passes, then deliberate garbage so sqlite refuses
|
||||
# to read the file past the header.
|
||||
header = b"SQLite format 3\x00" + b"\x10\x00\x02\x02\x00\x40\x20\x20"
|
||||
header += b"\x00\x00\x00\x0c\x00\x00\x23\x46\x00\x00\x00\x00"
|
||||
header = header.ljust(100, b"\x00")
|
||||
payload = b"definitely not a valid sqlite page \x00\x01\x02\x03" * 64
|
||||
blob = header + payload
|
||||
path.write_bytes(blob)
|
||||
return blob
|
||||
|
||||
|
||||
def test_init_db_refuses_corrupt_existing_file(tmp_path):
|
||||
db_path = tmp_path / "kanban.db"
|
||||
original = _write_corrupt_db(db_path)
|
||||
# Ensure the cache doesn't mask the guard.
|
||||
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
|
||||
|
||||
with pytest.raises(kb.KanbanDbCorruptError) as excinfo:
|
||||
kb.init_db(db_path=db_path)
|
||||
|
||||
err = excinfo.value
|
||||
assert err.db_path == db_path
|
||||
assert err.backup_path is not None
|
||||
assert err.backup_path.exists()
|
||||
assert err.backup_path.read_bytes() == original
|
||||
# Original bytes untouched — no schema was written on top.
|
||||
assert db_path.read_bytes() == original
|
||||
assert str(db_path) in str(err)
|
||||
assert str(err.backup_path) in str(err)
|
||||
|
||||
|
||||
def test_connect_refuses_corrupt_existing_file(tmp_path):
|
||||
db_path = tmp_path / "kanban.db"
|
||||
_write_corrupt_db(db_path)
|
||||
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
|
||||
|
||||
with pytest.raises(kb.KanbanDbCorruptError):
|
||||
kb.connect(db_path=db_path)
|
||||
|
||||
|
||||
def test_locked_healthy_db_does_not_classify_as_corrupt(tmp_path, monkeypatch):
|
||||
"""A transient lock during the probe must not produce a .corrupt backup
|
||||
and must not be reported as :class:`KanbanDbCorruptError`. Raw sqlite
|
||||
``OperationalError`` (lock/busy) is acceptable and expected."""
|
||||
db_path = tmp_path / "kanban.db"
|
||||
kb.init_db(db_path=db_path)
|
||||
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
|
||||
|
||||
real_connect = sqlite3.connect
|
||||
|
||||
def flaky_connect(*args, **kwargs):
|
||||
# First call is the integrity probe — simulate a lock.
|
||||
raise sqlite3.OperationalError("database is locked")
|
||||
|
||||
monkeypatch.setattr(kb.sqlite3, "connect", flaky_connect)
|
||||
|
||||
with pytest.raises(sqlite3.OperationalError):
|
||||
kb.connect(db_path=db_path)
|
||||
|
||||
# No .corrupt backup may be produced for a healthy-but-locked DB.
|
||||
backups = list(tmp_path.glob("*.corrupt.*"))
|
||||
assert backups == [], f"unexpected corrupt backups: {backups}"
|
||||
|
||||
# And once the lock clears, normal access still works.
|
||||
monkeypatch.setattr(kb.sqlite3, "connect", real_connect)
|
||||
with kb.connect(db_path=db_path) as conn:
|
||||
kb.create_task(conn, title="still here")
|
||||
titles = [t.title for t in kb.list_tasks(conn)]
|
||||
assert "still here" in titles
|
||||
|
||||
|
||||
def test_init_db_allows_missing_then_healthy(tmp_path):
|
||||
db_path = tmp_path / "fresh.db"
|
||||
assert not db_path.exists()
|
||||
kb.init_db(db_path=db_path)
|
||||
assert db_path.exists() and db_path.stat().st_size > 0
|
||||
|
||||
# Idempotent on a healthy DB: data survives a second init.
|
||||
with kb.connect(db_path=db_path) as conn:
|
||||
kb.create_task(conn, title="keeps")
|
||||
kb.init_db(db_path=db_path)
|
||||
with kb.connect(db_path=db_path) as conn:
|
||||
tasks = kb.list_tasks(conn)
|
||||
assert [t.title for t in tasks] == ["keeps"]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue