From fc1119ca66e321989a61564aa526b33cb6146d41 Mon Sep 17 00:00:00 2001 From: Max Freedom Pollard Date: Tue, 16 Jun 2026 20:59:51 -0700 Subject: [PATCH] fix(curator): stop the rollback safety snapshot from pruning its target Rolling back to the oldest curator snapshot failed and deleted that snapshot. rollback() takes a safety snapshot first, and snapshot_skills() ends by pruning the backups directory down to keep (5 by default). At the steady keep limit that prune removed the oldest snapshot, which is the very one being restored, so the extract found no skills.tar.gz and the rollback stopped with "snapshot extract failed (state restored)". Thread an optional protect set through snapshot_skills() into _prune_old() so the pre rollback safety snapshot can never evict the snapshot being restored. Add two regression tests covering restore of the oldest snapshot at the keep limit. Fixes #47612 --- agent/curator_backup.py | 30 +++++++++++---- tests/agent/test_curator_backup.py | 59 ++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 7 deletions(-) diff --git a/agent/curator_backup.py b/agent/curator_backup.py index 944886d729a..ddf8699e9bb 100644 --- a/agent/curator_backup.py +++ b/agent/curator_backup.py @@ -46,7 +46,7 @@ import shutil import tarfile from datetime import datetime, timezone from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple from hermes_constants import get_hermes_home from agent.skill_utils import is_excluded_skill_path @@ -208,13 +208,17 @@ def _write_manifest(dest: Path, reason: str, archive_path: Path, ) -def snapshot_skills(reason: str = "manual") -> Optional[Path]: +def snapshot_skills(reason: str = "manual", *, protect_ids: Optional[Set[str]] = None) -> Optional[Path]: """Create a tar.gz snapshot of ``~/.hermes/skills/`` and prune old ones. Returns the snapshot directory path, or ``None`` if the snapshot was skipped (backup disabled, skills dir missing, or an IO error occurred — in which case we log at debug and return None so the curator never aborts a pass because of a backup failure). + + ``protect_ids`` is forwarded to the prune step so callers can guarantee + specific snapshot ids survive even when they fall outside the keep + window (rollback passes the id it is about to restore from). """ if not is_enabled(): logger.debug("Curator backup disabled by config; skipping snapshot") @@ -276,15 +280,19 @@ def snapshot_skills(reason: str = "manual") -> Optional[Path]: pass return None - _prune_old(keep=get_keep()) + _prune_old(keep=get_keep(), protect=protect_ids) logger.info("Curator snapshot created: %s (%s)", snap_id, reason) return dest -def _prune_old(keep: int) -> List[str]: +def _prune_old(keep: int, protect: Optional[Set[str]] = None) -> List[str]: """Delete regular snapshots beyond the newest *keep*. Returns deleted - ids. Staging dirs (``.rollback-staging-*``) are implementation detail - and pruned independently on every call.""" + ids. Snapshot ids in *protect* are never deleted even when they fall + outside the keep window — rollback() uses this so the mandatory + pre-rollback safety snapshot can never evict the very snapshot being + restored. Staging dirs (``.rollback-staging-*``) are implementation + detail and pruned independently on every call.""" + protect = protect or set() backups = _backups_dir() if not backups.exists(): return [] @@ -305,6 +313,8 @@ def _prune_old(keep: int) -> List[str]: entries.sort(key=lambda t: t[0], reverse=True) deleted: List[str] = [] for _, path in entries[keep:]: + if path.name in protect: + continue try: shutil.rmtree(path) deleted.append(path.name) @@ -564,7 +574,13 @@ def rollback(backup_id: Optional[str] = None) -> Tuple[bool, str, Optional[Path] # out before touching anything — otherwise a failed extract could leave # the user with no skills. try: - snapshot_skills(reason=f"pre-rollback to {target.name}") + # Protect the target from this snapshot's prune step: at the steady + # keep limit, pruning the oldest snapshot would otherwise delete the + # very snapshot we are about to extract from. + snapshot_skills( + reason=f"pre-rollback to {target.name}", + protect_ids={target.name}, + ) except Exception as e: return (False, f"pre-rollback safety snapshot failed: {e}", None) diff --git a/tests/agent/test_curator_backup.py b/tests/agent/test_curator_backup.py index b375f98688f..0381be3d314 100644 --- a/tests/agent/test_curator_backup.py +++ b/tests/agent/test_curator_backup.py @@ -592,3 +592,62 @@ def test_restore_cron_skill_links_standalone(backup_env): assert report["restored"][0]["to"]["skills"] == ["narrow-a", "narrow-b"] assert len(report["skipped_missing"]) == 1 assert report["skipped_missing"][0]["job_id"] == "job-gone" + + +# --------------------------------------------------------------------------- +# Rollback must not let the pre-rollback safety snapshot prune the target +# (regression: restoring the oldest snapshot at the keep limit destroyed it) +# --------------------------------------------------------------------------- + +def _three_ordered_snapshots(cb, skills, monkeypatch): + """Create snapshots 05-01 / 05-02 / 05-03 capturing growing trees, with + keep=3 so the backups dir is exactly at the retention limit. 05-01 holds + only 'pristine'; later snapshots add 'extra2' and 'extra3'. Leaves + _utc_id patched to a newest id so the rollback safety snapshot sorts + last. Returns the oldest snapshot id.""" + monkeypatch.setattr(cb, "get_keep", lambda: 3) + plan = [ + ("2026-05-01T00-00-00Z", ["pristine"]), + ("2026-05-02T00-00-00Z", ["pristine", "extra2"]), + ("2026-05-03T00-00-00Z", ["pristine", "extra2", "extra3"]), + ] + for snap_id, names in plan: + for n in names: + _write_skill(skills, n) + monkeypatch.setattr(cb, "_utc_id", lambda now=None, _i=snap_id: _i) + assert cb.snapshot_skills(reason=snap_id) is not None + monkeypatch.setattr(cb, "_utc_id", lambda now=None: "2026-05-09T00-00-00Z") + return "2026-05-01T00-00-00Z" + + +def test_rollback_to_oldest_snapshot_at_keep_limit_succeeds(backup_env, monkeypatch): + """Restoring the oldest snapshot when the backups dir is at the keep limit + must succeed: the pre-rollback safety snapshot's prune step must not evict + the snapshot being restored.""" + cb = backup_env["cb"] + skills = backup_env["skills"] + oldest = _three_ordered_snapshots(cb, skills, monkeypatch) + + ok, msg, _ = cb.rollback(backup_id=oldest) + + assert ok is True, f"rollback to oldest snapshot should succeed, got: {msg}" + # 05-01 only contained 'pristine'; a real restore reflects exactly that. + assert (skills / "pristine" / "SKILL.md").exists() + assert not (skills / "extra3").exists(), "tree was not restored to the oldest snapshot" + + +def test_rollback_does_not_delete_the_snapshot_it_restores_from(backup_env, monkeypatch): + """The snapshot a rollback restores from must still exist afterwards — the + safety snapshot's prune must never delete the target.""" + cb = backup_env["cb"] + skills = backup_env["skills"] + oldest = _three_ordered_snapshots(cb, skills, monkeypatch) + target_dir = skills / ".curator_backups" / oldest + assert target_dir.exists(), "precondition: target snapshot exists before rollback" + + cb.rollback(backup_id=oldest) + + assert target_dir.exists(), ( + "the pre-rollback safety snapshot pruned away the snapshot being " + "restored — the oldest restore point is destroyed by restoring to it" + )