mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(kanban): content-addressed corrupt-DB backup filename
Repeated quarantines of an unchanged corrupt kanban.db used to amplify disk usage by N: the gateway dispatcher's 5-minute retry loop, multi- profile fleets sharing one DB, and manual reopen attempts each produced a fresh '.corrupt.<timestamp>.bak' copy of the same bytes. After 10 retries on a 100KB DB you had 11x the disk footprint of duplicate corrupt data. Derive the backup filename from a sha256 of the main DB instead of a timestamp + collision counter. Same bytes → same filename → skip the copy on retries. Different bytes (partial repair, further damage) → different filename → preserve separately. Sidecar (-wal/-shm) backups inherit the same content-addressed name. Inspired by @hanzckernel's PR #33529, simplified down to ~30 LOC: drop the persistent JSON marker file, drop the atomic temp+fsync+rename helper (shutil.copy2 is fine for a quarantine-only path), drop the gateway-side WAL/SHM fingerprint extension (the existing (path, mtime, size) tuple still gives the 5-minute retry semantics it needs), and drop the gateway-side helper extraction. The backup file existing IS the marker; no separate state needed. Test: tests/hermes_cli/test_kanban_db.py::test_repeated_corrupt_open_reuses_single_backup proves 10 retries on the same corrupt bytes produce 1 backup (was 11), and mutating the corrupt bytes produces a second backup with a different fingerprint. Refs #33529 Co-authored-by: hanzckernel <zhicheng.han@mathematik.uni-goettingen.de>
This commit is contained in:
parent
432a691758
commit
6f9182cb34
2 changed files with 67 additions and 22 deletions
|
|
@ -71,6 +71,7 @@ new locking.
|
|||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
|
@ -1138,14 +1139,21 @@ class KanbanDbCorruptError(RuntimeError):
|
|||
|
||||
|
||||
def _backup_corrupt_db(path: Path) -> Optional[Path]:
|
||||
"""Copy a corrupt DB (and its WAL/SHM sidecars) to a timestamped backup.
|
||||
"""Copy a corrupt DB (and its WAL/SHM sidecars) to a content-addressed backup.
|
||||
|
||||
The backup filename is deterministic in the main DB's sha256, so repeated
|
||||
quarantines of the same corrupt bytes (gateway restarts, dispatcher retries,
|
||||
multi-profile fleets all hitting the same shared DB) reuse one backup
|
||||
instead of amplifying disk usage by N. If the corrupt bytes actually
|
||||
change between attempts — e.g. a partial repair or further damage — the
|
||||
fingerprint changes and a separate backup is preserved.
|
||||
|
||||
Returns the backup path of the main DB file, or ``None`` if the copy
|
||||
itself failed (the caller still raises loudly in that case).
|
||||
|
||||
Writes are confined to the original DB's parent directory. The
|
||||
backup basename is derived purely from ``path.name``, never from
|
||||
caller-supplied directory segments — no traversal is possible.
|
||||
Writes are confined to the original DB's parent directory. The backup
|
||||
basename is derived purely from ``path.name`` and a content hash, never
|
||||
from caller-supplied directory segments — no traversal is possible.
|
||||
"""
|
||||
# Resolve once and pin the parent so subsequent path operations cannot
|
||||
# escape it. ``Path.resolve()`` collapses any ``..`` segments and
|
||||
|
|
@ -1153,32 +1161,31 @@ def _backup_corrupt_db(path: Path) -> Optional[Path]:
|
|||
resolved = path.resolve()
|
||||
parent = resolved.parent
|
||||
base_name = resolved.name # basename only
|
||||
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
candidate = parent / f"{base_name}.corrupt.{stamp}.bak"
|
||||
# Defensive: candidate must still be inside parent after construction.
|
||||
# f-string interpolation of ``base_name`` cannot escape ``parent``
|
||||
# because ``base_name`` is itself a resolved basename, but assert it
|
||||
# anyway so static analyzers can see the containment guarantee.
|
||||
if candidate.parent != parent:
|
||||
return None
|
||||
counter = 0
|
||||
while candidate.exists():
|
||||
counter += 1
|
||||
candidate = parent / f"{base_name}.corrupt.{stamp}.{counter}.bak"
|
||||
if candidate.parent != parent:
|
||||
return None
|
||||
digest = hashlib.sha256()
|
||||
try:
|
||||
shutil.copy2(resolved, candidate)
|
||||
with resolved.open("rb") as handle:
|
||||
for chunk in iter(lambda: handle.read(1024 * 1024), b""):
|
||||
digest.update(chunk)
|
||||
except OSError:
|
||||
return None
|
||||
token = digest.hexdigest()[:16]
|
||||
candidate = parent / f"{base_name}.corrupt.{token}.bak"
|
||||
# Defensive: candidate must still be inside parent after construction.
|
||||
if candidate.parent != parent:
|
||||
return None
|
||||
if not candidate.exists():
|
||||
try:
|
||||
shutil.copy2(resolved, candidate)
|
||||
except OSError:
|
||||
return None
|
||||
for suffix in ("-wal", "-shm"):
|
||||
sidecar = parent / (base_name + suffix)
|
||||
if sidecar.parent != parent or not sidecar.exists():
|
||||
continue
|
||||
sidecar_backup = parent / (candidate.name + suffix)
|
||||
if sidecar_backup.parent != parent or sidecar_backup.exists():
|
||||
continue
|
||||
try:
|
||||
sidecar_backup = parent / (candidate.name + suffix)
|
||||
if sidecar_backup.parent != parent:
|
||||
continue
|
||||
shutil.copy2(sidecar, sidecar_backup)
|
||||
except OSError:
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -3317,6 +3317,44 @@ def test_connect_refuses_corrupt_existing_file(tmp_path):
|
|||
kb.connect(db_path=db_path)
|
||||
|
||||
|
||||
def test_repeated_corrupt_open_reuses_single_backup(tmp_path):
|
||||
"""Repeated quarantines of the same corrupt bytes must not amplify disk usage.
|
||||
|
||||
Regression for the gateway dispatcher's 5-min retry loop on shared kanban
|
||||
DBs across multi-profile fleets: each retry on an unchanged corrupt file
|
||||
used to create a fresh ``.corrupt.<timestamp>.bak`` until disk filled. The
|
||||
content-addressed backup name is deterministic in the DB's sha256, so
|
||||
N retries of the same bytes share one backup.
|
||||
"""
|
||||
db_path = tmp_path / "kanban.db"
|
||||
original = _write_corrupt_db(db_path)
|
||||
|
||||
backups: set[Path] = set()
|
||||
for _ in range(10):
|
||||
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
|
||||
with pytest.raises(kb.KanbanDbCorruptError) as excinfo:
|
||||
kb.connect(db_path=db_path)
|
||||
assert excinfo.value.backup_path is not None
|
||||
backups.add(excinfo.value.backup_path)
|
||||
|
||||
assert len(backups) == 1, f"expected 1 deterministic backup, got {len(backups)}"
|
||||
(backup,) = backups
|
||||
assert backup.exists()
|
||||
assert backup.read_bytes() == original
|
||||
|
||||
# Mutate the corrupt bytes — fingerprint changes, separate backup preserved.
|
||||
with db_path.open("r+b") as f:
|
||||
f.seek(4096)
|
||||
f.write(b"\xAB" * 64)
|
||||
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
|
||||
with pytest.raises(kb.KanbanDbCorruptError) as excinfo2:
|
||||
kb.connect(db_path=db_path)
|
||||
second_backup = excinfo2.value.backup_path
|
||||
assert second_backup is not None
|
||||
assert second_backup != backup
|
||||
assert second_backup.exists()
|
||||
|
||||
|
||||
def test_locked_healthy_db_does_not_classify_as_corrupt(tmp_path, monkeypatch):
|
||||
"""A transient lock during the probe must not produce a .corrupt backup
|
||||
and must not be reported as :class:`KanbanDbCorruptError`. Raw sqlite
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue