diff --git a/agent/curator_backup.py b/agent/curator_backup.py index 944886d729a..ddf8699e9bb 100644 --- a/agent/curator_backup.py +++ b/agent/curator_backup.py @@ -46,7 +46,7 @@ import shutil import tarfile from datetime import datetime, timezone from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple from hermes_constants import get_hermes_home from agent.skill_utils import is_excluded_skill_path @@ -208,13 +208,17 @@ def _write_manifest(dest: Path, reason: str, archive_path: Path, ) -def snapshot_skills(reason: str = "manual") -> Optional[Path]: +def snapshot_skills(reason: str = "manual", *, protect_ids: Optional[Set[str]] = None) -> Optional[Path]: """Create a tar.gz snapshot of ``~/.hermes/skills/`` and prune old ones. Returns the snapshot directory path, or ``None`` if the snapshot was skipped (backup disabled, skills dir missing, or an IO error occurred — in which case we log at debug and return None so the curator never aborts a pass because of a backup failure). + + ``protect_ids`` is forwarded to the prune step so callers can guarantee + specific snapshot ids survive even when they fall outside the keep + window (rollback passes the id it is about to restore from). """ if not is_enabled(): logger.debug("Curator backup disabled by config; skipping snapshot") @@ -276,15 +280,19 @@ def snapshot_skills(reason: str = "manual") -> Optional[Path]: pass return None - _prune_old(keep=get_keep()) + _prune_old(keep=get_keep(), protect=protect_ids) logger.info("Curator snapshot created: %s (%s)", snap_id, reason) return dest -def _prune_old(keep: int) -> List[str]: +def _prune_old(keep: int, protect: Optional[Set[str]] = None) -> List[str]: """Delete regular snapshots beyond the newest *keep*. Returns deleted - ids. Staging dirs (``.rollback-staging-*``) are implementation detail - and pruned independently on every call.""" + ids. Snapshot ids in *protect* are never deleted even when they fall + outside the keep window — rollback() uses this so the mandatory + pre-rollback safety snapshot can never evict the very snapshot being + restored. Staging dirs (``.rollback-staging-*``) are implementation + detail and pruned independently on every call.""" + protect = protect or set() backups = _backups_dir() if not backups.exists(): return [] @@ -305,6 +313,8 @@ def _prune_old(keep: int) -> List[str]: entries.sort(key=lambda t: t[0], reverse=True) deleted: List[str] = [] for _, path in entries[keep:]: + if path.name in protect: + continue try: shutil.rmtree(path) deleted.append(path.name) @@ -564,7 +574,13 @@ def rollback(backup_id: Optional[str] = None) -> Tuple[bool, str, Optional[Path] # out before touching anything — otherwise a failed extract could leave # the user with no skills. try: - snapshot_skills(reason=f"pre-rollback to {target.name}") + # Protect the target from this snapshot's prune step: at the steady + # keep limit, pruning the oldest snapshot would otherwise delete the + # very snapshot we are about to extract from. + snapshot_skills( + reason=f"pre-rollback to {target.name}", + protect_ids={target.name}, + ) except Exception as e: return (False, f"pre-rollback safety snapshot failed: {e}", None) diff --git a/tests/agent/test_curator_backup.py b/tests/agent/test_curator_backup.py index b375f98688f..0381be3d314 100644 --- a/tests/agent/test_curator_backup.py +++ b/tests/agent/test_curator_backup.py @@ -592,3 +592,62 @@ def test_restore_cron_skill_links_standalone(backup_env): assert report["restored"][0]["to"]["skills"] == ["narrow-a", "narrow-b"] assert len(report["skipped_missing"]) == 1 assert report["skipped_missing"][0]["job_id"] == "job-gone" + + +# --------------------------------------------------------------------------- +# Rollback must not let the pre-rollback safety snapshot prune the target +# (regression: restoring the oldest snapshot at the keep limit destroyed it) +# --------------------------------------------------------------------------- + +def _three_ordered_snapshots(cb, skills, monkeypatch): + """Create snapshots 05-01 / 05-02 / 05-03 capturing growing trees, with + keep=3 so the backups dir is exactly at the retention limit. 05-01 holds + only 'pristine'; later snapshots add 'extra2' and 'extra3'. Leaves + _utc_id patched to a newest id so the rollback safety snapshot sorts + last. Returns the oldest snapshot id.""" + monkeypatch.setattr(cb, "get_keep", lambda: 3) + plan = [ + ("2026-05-01T00-00-00Z", ["pristine"]), + ("2026-05-02T00-00-00Z", ["pristine", "extra2"]), + ("2026-05-03T00-00-00Z", ["pristine", "extra2", "extra3"]), + ] + for snap_id, names in plan: + for n in names: + _write_skill(skills, n) + monkeypatch.setattr(cb, "_utc_id", lambda now=None, _i=snap_id: _i) + assert cb.snapshot_skills(reason=snap_id) is not None + monkeypatch.setattr(cb, "_utc_id", lambda now=None: "2026-05-09T00-00-00Z") + return "2026-05-01T00-00-00Z" + + +def test_rollback_to_oldest_snapshot_at_keep_limit_succeeds(backup_env, monkeypatch): + """Restoring the oldest snapshot when the backups dir is at the keep limit + must succeed: the pre-rollback safety snapshot's prune step must not evict + the snapshot being restored.""" + cb = backup_env["cb"] + skills = backup_env["skills"] + oldest = _three_ordered_snapshots(cb, skills, monkeypatch) + + ok, msg, _ = cb.rollback(backup_id=oldest) + + assert ok is True, f"rollback to oldest snapshot should succeed, got: {msg}" + # 05-01 only contained 'pristine'; a real restore reflects exactly that. + assert (skills / "pristine" / "SKILL.md").exists() + assert not (skills / "extra3").exists(), "tree was not restored to the oldest snapshot" + + +def test_rollback_does_not_delete_the_snapshot_it_restores_from(backup_env, monkeypatch): + """The snapshot a rollback restores from must still exist afterwards — the + safety snapshot's prune must never delete the target.""" + cb = backup_env["cb"] + skills = backup_env["skills"] + oldest = _three_ordered_snapshots(cb, skills, monkeypatch) + target_dir = skills / ".curator_backups" / oldest + assert target_dir.exists(), "precondition: target snapshot exists before rollback" + + cb.rollback(backup_id=oldest) + + assert target_dir.exists(), ( + "the pre-rollback safety snapshot pruned away the snapshot being " + "restored — the oldest restore point is destroyed by restoring to it" + )