fix(curator): stop the rollback safety snapshot from pruning its target

Rolling back to the oldest curator snapshot failed and deleted that
snapshot. rollback() takes a safety snapshot first, and snapshot_skills()
ends by pruning the backups directory down to keep (5 by default). At the
steady keep limit that prune removed the oldest snapshot, which is the very
one being restored, so the extract found no skills.tar.gz and the rollback
stopped with "snapshot extract failed (state restored)".

Thread an optional protect set through snapshot_skills() into _prune_old()
so the pre rollback safety snapshot can never evict the snapshot being
restored. Add two regression tests covering restore of the oldest snapshot
at the keep limit.

Fixes #47612
This commit is contained in:
Max Freedom Pollard 2026-06-16 20:59:51 -07:00 committed by Teknium
parent 7bbffceb9c
commit fc1119ca66
2 changed files with 82 additions and 7 deletions

View file

@ -46,7 +46,7 @@ import shutil
import tarfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Set, Tuple
from hermes_constants import get_hermes_home
from agent.skill_utils import is_excluded_skill_path
@ -208,13 +208,17 @@ def _write_manifest(dest: Path, reason: str, archive_path: Path,
)
def snapshot_skills(reason: str = "manual") -> Optional[Path]:
def snapshot_skills(reason: str = "manual", *, protect_ids: Optional[Set[str]] = None) -> Optional[Path]:
"""Create a tar.gz snapshot of ``~/.hermes/skills/`` and prune old ones.
Returns the snapshot directory path, or ``None`` if the snapshot was
skipped (backup disabled, skills dir missing, or an IO error occurred
in which case we log at debug and return None so the curator never
aborts a pass because of a backup failure).
``protect_ids`` is forwarded to the prune step so callers can guarantee
specific snapshot ids survive even when they fall outside the keep
window (rollback passes the id it is about to restore from).
"""
if not is_enabled():
logger.debug("Curator backup disabled by config; skipping snapshot")
@ -276,15 +280,19 @@ def snapshot_skills(reason: str = "manual") -> Optional[Path]:
pass
return None
_prune_old(keep=get_keep())
_prune_old(keep=get_keep(), protect=protect_ids)
logger.info("Curator snapshot created: %s (%s)", snap_id, reason)
return dest
def _prune_old(keep: int) -> List[str]:
def _prune_old(keep: int, protect: Optional[Set[str]] = None) -> List[str]:
"""Delete regular snapshots beyond the newest *keep*. Returns deleted
ids. Staging dirs (``.rollback-staging-*``) are implementation detail
and pruned independently on every call."""
ids. Snapshot ids in *protect* are never deleted even when they fall
outside the keep window rollback() uses this so the mandatory
pre-rollback safety snapshot can never evict the very snapshot being
restored. Staging dirs (``.rollback-staging-*``) are implementation
detail and pruned independently on every call."""
protect = protect or set()
backups = _backups_dir()
if not backups.exists():
return []
@ -305,6 +313,8 @@ def _prune_old(keep: int) -> List[str]:
entries.sort(key=lambda t: t[0], reverse=True)
deleted: List[str] = []
for _, path in entries[keep:]:
if path.name in protect:
continue
try:
shutil.rmtree(path)
deleted.append(path.name)
@ -564,7 +574,13 @@ def rollback(backup_id: Optional[str] = None) -> Tuple[bool, str, Optional[Path]
# out before touching anything — otherwise a failed extract could leave
# the user with no skills.
try:
snapshot_skills(reason=f"pre-rollback to {target.name}")
# Protect the target from this snapshot's prune step: at the steady
# keep limit, pruning the oldest snapshot would otherwise delete the
# very snapshot we are about to extract from.
snapshot_skills(
reason=f"pre-rollback to {target.name}",
protect_ids={target.name},
)
except Exception as e:
return (False, f"pre-rollback safety snapshot failed: {e}", None)

View file

@ -592,3 +592,62 @@ def test_restore_cron_skill_links_standalone(backup_env):
assert report["restored"][0]["to"]["skills"] == ["narrow-a", "narrow-b"]
assert len(report["skipped_missing"]) == 1
assert report["skipped_missing"][0]["job_id"] == "job-gone"
# ---------------------------------------------------------------------------
# Rollback must not let the pre-rollback safety snapshot prune the target
# (regression: restoring the oldest snapshot at the keep limit destroyed it)
# ---------------------------------------------------------------------------
def _three_ordered_snapshots(cb, skills, monkeypatch):
"""Create snapshots 05-01 / 05-02 / 05-03 capturing growing trees, with
keep=3 so the backups dir is exactly at the retention limit. 05-01 holds
only 'pristine'; later snapshots add 'extra2' and 'extra3'. Leaves
_utc_id patched to a newest id so the rollback safety snapshot sorts
last. Returns the oldest snapshot id."""
monkeypatch.setattr(cb, "get_keep", lambda: 3)
plan = [
("2026-05-01T00-00-00Z", ["pristine"]),
("2026-05-02T00-00-00Z", ["pristine", "extra2"]),
("2026-05-03T00-00-00Z", ["pristine", "extra2", "extra3"]),
]
for snap_id, names in plan:
for n in names:
_write_skill(skills, n)
monkeypatch.setattr(cb, "_utc_id", lambda now=None, _i=snap_id: _i)
assert cb.snapshot_skills(reason=snap_id) is not None
monkeypatch.setattr(cb, "_utc_id", lambda now=None: "2026-05-09T00-00-00Z")
return "2026-05-01T00-00-00Z"
def test_rollback_to_oldest_snapshot_at_keep_limit_succeeds(backup_env, monkeypatch):
"""Restoring the oldest snapshot when the backups dir is at the keep limit
must succeed: the pre-rollback safety snapshot's prune step must not evict
the snapshot being restored."""
cb = backup_env["cb"]
skills = backup_env["skills"]
oldest = _three_ordered_snapshots(cb, skills, monkeypatch)
ok, msg, _ = cb.rollback(backup_id=oldest)
assert ok is True, f"rollback to oldest snapshot should succeed, got: {msg}"
# 05-01 only contained 'pristine'; a real restore reflects exactly that.
assert (skills / "pristine" / "SKILL.md").exists()
assert not (skills / "extra3").exists(), "tree was not restored to the oldest snapshot"
def test_rollback_does_not_delete_the_snapshot_it_restores_from(backup_env, monkeypatch):
"""The snapshot a rollback restores from must still exist afterwards — the
safety snapshot's prune must never delete the target."""
cb = backup_env["cb"]
skills = backup_env["skills"]
oldest = _three_ordered_snapshots(cb, skills, monkeypatch)
target_dir = skills / ".curator_backups" / oldest
assert target_dir.exists(), "precondition: target snapshot exists before rollback"
cb.rollback(backup_id=oldest)
assert target_dir.exists(), (
"the pre-rollback safety snapshot pruned away the snapshot being "
"restored — the oldest restore point is destroyed by restoring to it"
)