mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-03 02:11:48 +00:00
* fix(curator): defer first run and add --dry-run preview (#18373) Curator was meant to run 7 days after install, not on the very first gateway tick. On a fresh install (no .curator_state), should_run_now() returned True immediately because last_run_at was None — so the gateway cron ticker fired Curator against a fresh skill library moments after 'hermes update'. Combined with the binary 'agent-created' provenance model (anything not bundled and not hub-installed), this consolidated hand-authored user workflow skills without consent. Changes: - should_run_now(): first observation seeds last_run_at='now' and returns False. The next real pass fires one full interval_hours later (7 days by default), matching the original design intent. - hermes curator run --dry-run: produces the same review report without applying automatic transitions OR permitting the LLM to call skill_manage / terminal mv. A DRY-RUN banner is prepended to the prompt and the caller skips apply_automatic_transitions. State is NOT advanced so a preview doesn't defer the next scheduled real pass. - hermes update: prints a one-liner on fresh installs pointing at --dry-run, pause, and the docs. Silent on steady state. - Docs: curator.md and cli-commands.md explain the deferred first-run behavior and warn that hand-written SKILL.md files share the 'agent-created' bucket, with guidance to pin or preview before the first pass. Tests: - test_first_run_defers replaces the old 'first run always eligible' assertion — same fixture, inverted expectation. - test_maybe_run_curator_defers_on_fresh_install covers the gateway tick path end-to-end. - Three new dry-run tests cover state-advance suppression, prompt banner injection, and apply_automatic_transitions skipping. Fixes #18373. * feat(curator): pre-run backup + rollback (#18373) Every real curator pass now snapshots ~/.hermes/skills/ into ~/.hermes/skills/.curator_backups/<utc-iso>/skills.tar.gz before calling apply_automatic_transitions or the LLM review. 
If a run consolidates or archives something the user didn't want touched, 'hermes curator rollback' restores the tree in one command. Dry-run is skipped — no mutation means no snapshot needed. Changes: - agent/curator_backup.py (new): tar.gz snapshot + safe rollback. The snapshot excludes .curator_backups/ (would recurse) and .hub/ (managed by the skills hub). Extract refuses absolute paths and .. components, and uses tarfile's filter='data' on Python 3.12+. Rollback takes a pre-rollback safety snapshot FIRST, stages the current tree into .rollback-staging-<ts>/ so the extract lands in an empty dir, and cleans the staging dir on success. A failed extract restores the staged contents. - agent/curator.py: run_curator_review() calls curator_backup. snapshot_skills(reason='pre-curator-run') before apply_automatic_ transitions. Best-effort — a failed snapshot logs at debug and the run continues (a transient disk issue shouldn't silently disable curator forever). - hermes_cli/curator.py: new 'hermes curator backup' and 'hermes curator rollback' subcommands. rollback supports --list, --id <ts>, -y. - hermes_cli/config.py: curator.backup.{enabled, keep} config block with sane defaults (enabled=true, keep=5). - Docs: curator.md gets a 'Backups and rollback' section; cli-commands .md table gets the new rows. Tests (new file tests/agent/test_curator_backup.py, 16 cases): - snapshot creates tarball + manifest with correct counts - snapshot excludes .curator_backups/ (recursion guard) and .hub/ - snapshot disabled via config returns None without creating anything - snapshot uniquifies ids within the same second (-01 suffix) - prune honors keep count, newest-first - list_backups + _resolve_backup cover newest-default and unknown-id - rollback restores a deleted skill with content intact - rollback is itself undoable — safety snapshot shows up in list_backups - rollback with no snapshots returns an error - rollback refuses tarballs with absolute paths or .. 
components - real curator runs take a 'pre-curator-run' snapshot; dry-runs do not All curator tests: 210 passing locally.
440 lines
15 KiB
Python
440 lines
15 KiB
Python
"""Curator snapshot + rollback.
|
|
|
|
A pre-run snapshot of ``~/.hermes/skills/`` (excluding ``.curator_backups/``
|
|
itself) is taken before any mutating curator pass. Snapshots are tar.gz
|
|
files under ``~/.hermes/skills/.curator_backups/<utc-iso>/`` with a
|
|
companion ``manifest.json`` describing the snapshot (reason, time, size,
|
|
counted skill files). Rollback picks a snapshot, moves the current
|
|
``skills/`` tree aside into another snapshot so even the rollback itself
|
|
is undoable, then extracts the chosen snapshot into place.
|
|
|
|
The snapshot does NOT include:
|
|
- ``.curator_backups/`` (would recurse)
|
|
- ``.hub/`` (hub-installed skills — managed by the hub, not us)
|
|
|
|
It DOES include:
|
|
- all SKILL.md files + their directories (``scripts/``, ``references/``,
|
|
``templates/``, ``assets/``)
|
|
- ``.usage.json`` (usage telemetry — needed to rehydrate state cleanly)
|
|
- ``.archive/`` (so rollback restores previously-archived skills too)
|
|
- ``.curator_state`` (so rolling back also restores the last-run-at
|
|
pointer — otherwise the curator would immediately re-fire on the next
|
|
tick)
|
|
- ``.bundled_manifest`` (so protection markers stay consistent)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import shutil
|
|
import tarfile
|
|
import tempfile
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from hermes_constants import get_hermes_home
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# How many snapshots to retain when pruning, unless overridden by the
# curator.backup.keep config value (see get_keep()).
DEFAULT_KEEP = 5

# Entries under skills/ that should NEVER be rolled up into a snapshot.
# .hub/ is managed by the skills hub; rolling it back would break lockfile
# invariants. .curator_backups is the backup dir itself — recursion bomb.
_EXCLUDE_TOP_LEVEL = {".curator_backups", ".hub"}

# Snapshot id regex: UTC ISO with colons replaced by dashes so the filename
# is portable (Windows-safe). An optional ``-NN`` suffix handles two
# snapshots landing in the same wallclock second.
_ID_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}Z(-\d{2})?$")
def _backups_dir() -> Path:
    """Location of the curator snapshot store under the skills tree."""
    return Path(get_hermes_home(), "skills", ".curator_backups")
def _skills_dir() -> Path:
    """The skills tree that snapshots cover: ``~/.hermes/skills/``."""
    return get_hermes_home().joinpath("skills")
def _utc_id(now: Optional[datetime] = None) -> str:
|
|
"""UTC ISO-ish filesystem-safe timestamp: ``2026-05-01T13-05-42Z``."""
|
|
if now is None:
|
|
now = datetime.now(timezone.utc)
|
|
# isoformat → "2026-05-01T13:05:42.123456+00:00"; strip subseconds and tz.
|
|
s = now.replace(microsecond=0).isoformat()
|
|
if s.endswith("+00:00"):
|
|
s = s[:-6]
|
|
return s.replace(":", "-") + "Z"
|
|
|
|
|
|
def _load_config() -> Dict[str, Any]:
    """Return the ``curator.backup`` config sub-dict, or ``{}``.

    Any failure — import error, load error, or a non-dict at any level of
    the ``curator`` → ``backup`` path — degrades to an empty dict so the
    callers fall back to their defaults.
    """
    try:
        from hermes_cli.config import load_config
        cfg = load_config()
    except Exception as e:
        logger.debug("Failed to load config for curator backup: %s", e)
        return {}
    node: Any = cfg
    # Walk down curator → backup, bailing out if any level is not a dict.
    for key in ("curator", "backup"):
        if not isinstance(node, dict):
            return {}
        node = node.get(key) or {}
    return node if isinstance(node, dict) else {}
def is_enabled() -> bool:
    """Whether curator snapshots are enabled. Defaults to True — the
    backup exists for safety, so it must be on unless explicitly
    switched off in config."""
    cfg = _load_config()
    return bool(cfg.get("enabled", True))
def get_keep() -> int:
    """Number of snapshots to retain, from config, floored at 1."""
    raw = _load_config().get("keep", DEFAULT_KEEP)
    try:
        value = int(raw)
    except (TypeError, ValueError):
        # Non-numeric config value — fall back to the default.
        value = DEFAULT_KEEP
    return max(1, value)
# ---------------------------------------------------------------------------
|
|
# Snapshot
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _count_skill_files(base: Path) -> int:
|
|
try:
|
|
return sum(1 for _ in base.rglob("SKILL.md"))
|
|
except OSError:
|
|
return 0
|
|
|
|
|
|
def _write_manifest(dest: Path, reason: str, archive_path: Path,
|
|
skills_counted: int) -> None:
|
|
manifest = {
|
|
"id": dest.name,
|
|
"reason": reason,
|
|
"created_at": datetime.now(timezone.utc).isoformat(),
|
|
"archive": archive_path.name,
|
|
"archive_bytes": archive_path.stat().st_size,
|
|
"skill_files": skills_counted,
|
|
}
|
|
(dest / "manifest.json").write_text(
|
|
json.dumps(manifest, indent=2, sort_keys=True), encoding="utf-8"
|
|
)
|
|
|
|
|
|
def snapshot_skills(reason: str = "manual") -> Optional[Path]:
    """Create a tar.gz snapshot of ``~/.hermes/skills/`` and prune old ones.

    Parameters:
        reason: free-text label recorded in the snapshot manifest (e.g.
            ``"pre-curator-run"``). Defaults to ``"manual"``.

    Returns the snapshot directory path, or ``None`` if the snapshot was
    skipped (backup disabled, skills dir missing, or an IO error occurred —
    in which case we log at debug and return None so the curator never
    aborts a pass because of a backup failure).
    """
    if not is_enabled():
        logger.debug("Curator backup disabled by config; skipping snapshot")
        return None

    skills = _skills_dir()
    if not skills.exists():
        logger.debug("No ~/.hermes/skills/ directory — nothing to back up")
        return None

    backups = _backups_dir()
    try:
        backups.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        logger.debug("Failed to create backups dir %s: %s", backups, e)
        return None

    # Uniquify: if a snapshot with the same second already exists (can
    # happen if two curator runs fire in the same second), append a short
    # counter. Avoids clobbering and avoids timestamp collisions.
    base_id = _utc_id()
    snap_id = base_id
    counter = 1
    while (backups / snap_id).exists():
        snap_id = f"{base_id}-{counter:02d}"
        counter += 1

    dest = backups / snap_id
    try:
        # exist_ok=False: the uniquify loop above guarantees the name is
        # free; a collision here means a race, which we treat as failure.
        dest.mkdir(parents=True, exist_ok=False)
    except OSError as e:
        logger.debug("Failed to create snapshot dir %s: %s", dest, e)
        return None

    archive = dest / "skills.tar.gz"
    try:
        # Stream into the tarball — no tempdir copy needed.
        with tarfile.open(archive, "w:gz", compresslevel=6) as tf:
            for entry in sorted(skills.iterdir()):
                if entry.name in _EXCLUDE_TOP_LEVEL:
                    continue
                # arcname: store paths relative to skills/ so extraction
                # drops cleanly back into the skills dir.
                tf.add(str(entry), arcname=entry.name, recursive=True)
        _write_manifest(dest, reason, archive, _count_skill_files(skills))
    except (OSError, tarfile.TarError) as e:
        logger.debug("Curator snapshot failed: %s", e, exc_info=True)
        # Clean up the partial snapshot. ignore_errors=True already
        # suppresses any failure, so no surrounding try/except is needed.
        shutil.rmtree(dest, ignore_errors=True)
        return None

    _prune_old(keep=get_keep())
    logger.info("Curator snapshot created: %s (%s)", snap_id, reason)
    return dest
def _prune_old(keep: int) -> List[str]:
    """Drop regular snapshots beyond the newest *keep*; return deleted ids.

    ``.rollback-staging-*`` directories are an internal implementation
    detail and are swept up unconditionally on every call.
    """
    backups = _backups_dir()
    if not backups.exists():
        return []

    snapshots: List[Path] = []
    stale: List[Path] = []
    for child in backups.iterdir():
        if not child.is_dir():
            continue
        if child.name.startswith(".rollback-staging-"):
            # Staging dirs are only supposed to exist briefly during a
            # rollback. If we find one here (e.g. from a crashed rollback),
            # clean it up opportunistically.
            stale.append(child)
        elif _ID_RE.match(child.name):
            snapshots.append(child)

    # Newest first — lexicographic order works because ids are UTC ISO.
    snapshots.sort(key=lambda p: p.name, reverse=True)

    removed: List[str] = []
    for victim in snapshots[keep:]:
        try:
            shutil.rmtree(victim)
        except OSError as e:
            logger.debug("Failed to prune %s: %s", victim, e)
        else:
            removed.append(victim.name)

    for leftover in stale:
        try:
            shutil.rmtree(leftover)
        except OSError as e:
            logger.debug("Failed to clean stale staging dir %s: %s", leftover, e)

    return removed
# ---------------------------------------------------------------------------
|
|
# List + rollback
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _read_manifest(snap_dir: Path) -> Dict[str, Any]:
|
|
mf = snap_dir / "manifest.json"
|
|
if not mf.exists():
|
|
return {}
|
|
try:
|
|
return json.loads(mf.read_text(encoding="utf-8"))
|
|
except (OSError, json.JSONDecodeError):
|
|
return {}
|
|
|
|
|
|
def list_backups() -> List[Dict[str, Any]]:
    """Enumerate restorable snapshots, newest first.

    An entry qualifies only if it is a directory whose name matches the
    snapshot-id pattern and which actually contains a ``skills.tar.gz``
    tarball. Transient ``.rollback-staging-*`` directories created
    mid-rollback are implementation detail and are never listed.
    """
    root = _backups_dir()
    if not root.exists():
        return []
    results: List[Dict[str, Any]] = []
    for snap in sorted(root.iterdir(), reverse=True):
        tarball = snap / "skills.tar.gz"
        if not (snap.is_dir() and _ID_RE.match(snap.name) and tarball.exists()):
            continue
        info = _read_manifest(snap)
        # Older or partially-written manifests may lack fields — backfill
        # the basics so the CLI table always has something to show.
        info.setdefault("id", snap.name)
        info.setdefault("path", str(snap))
        if "archive_bytes" not in info:
            try:
                info["archive_bytes"] = tarball.stat().st_size
            except OSError:
                info["archive_bytes"] = 0
        results.append(info)
    return results
def _resolve_backup(backup_id: Optional[str]) -> Optional[Path]:
    """Locate a snapshot directory.

    With an explicit *backup_id* the exact snapshot is required; with
    ``None`` the newest restorable snapshot wins. Returns ``None`` when
    nothing matches.
    """
    root = _backups_dir()
    if not root.exists():
        return None
    if backup_id:
        # Validate the id string itself (not the resolved name) so ids
        # containing path separators are rejected outright.
        candidate = root / backup_id
        ok = (
            _ID_RE.match(backup_id)
            and candidate.is_dir()
            and (candidate / "skills.tar.gz").exists()
        )
        return candidate if ok else None
    for child in sorted(root.iterdir(), reverse=True):
        if child.is_dir() and _ID_RE.match(child.name) and (child / "skills.tar.gz").exists():
            return child
    return None
def rollback(backup_id: Optional[str] = None) -> Tuple[bool, str, Optional[Path]]:
    """Restore ``~/.hermes/skills/`` from a snapshot.

    Strategy:
    1. Resolve the target snapshot (explicit id or newest regular).
    2. Take a safety snapshot of the CURRENT skills tree under
       ``.curator_backups/pre-rollback-<ts>/`` so the rollback itself is
       undoable.
    3. Move all current top-level entries (except ``.curator_backups``
       and ``.hub``) into a tempdir.
    4. Extract the chosen snapshot into ``~/.hermes/skills/``.
    5. On failure during 4, move the tempdir contents back (best-effort)
       and return failure.

    Returns ``(ok, message, snapshot_path)``.
    """
    target = _resolve_backup(backup_id)
    if target is None:
        return (
            False,
            f"no matching backup found"
            + (f" for id '{backup_id}'" if backup_id else "")
            + " (use `hermes curator rollback --list` to see available snapshots)",
            None,
        )
    archive = target / "skills.tar.gz"
    if not archive.exists():
        return (False, f"snapshot {target.name} has no skills.tar.gz — corrupted?", None)

    skills = _skills_dir()
    skills.mkdir(parents=True, exist_ok=True)
    backups = _backups_dir()
    backups.mkdir(parents=True, exist_ok=True)

    # Step 2: safety snapshot of current state FIRST. If this fails we bail
    # out before touching anything — otherwise a failed extract could leave
    # the user with no skills.
    # NOTE(review): snapshot_skills() signals failure by RETURNING None
    # (disabled config, missing dir, IO error) rather than raising, so this
    # except-only guard lets the rollback proceed without a safety net in
    # those cases. Confirm whether a None return should also abort here.
    try:
        snapshot_skills(reason=f"pre-rollback to {target.name}")
    except Exception as e:
        return (False, f"pre-rollback safety snapshot failed: {e}", None)

    # Additionally move current entries into an internal staging dir so
    # the extract happens into an empty skills tree (predictable result).
    # This dir is implementation detail — not listed as a restorable
    # backup. The safety snapshot above is the user-facing undo handle.
    staged = backups / f".rollback-staging-{_utc_id()}"
    try:
        staged.mkdir(parents=True, exist_ok=False)
    except OSError as e:
        return (False, f"failed to create staging dir: {e}", None)

    # Step 3: stage the current tree. `moved` records (original, staged)
    # pairs so either failure path below can undo the moves.
    moved: List[Tuple[Path, Path]] = []
    try:
        for entry in list(skills.iterdir()):
            if entry.name in _EXCLUDE_TOP_LEVEL:
                continue
            dest = staged / entry.name
            shutil.move(str(entry), str(dest))
            moved.append((entry, dest))
    except OSError as e:
        # Best-effort rollback of the move
        for orig, dest in moved:
            try:
                shutil.move(str(dest), str(orig))
            except OSError:
                pass
        try:
            shutil.rmtree(staged, ignore_errors=True)
        except OSError:
            pass
        return (False, f"failed to stage current skills: {e}", None)

    # Step 4: extract the snapshot into skills/
    try:
        with tarfile.open(archive, "r:gz") as tf:
            # Python 3.12+ supports filter='data' for safer extraction.
            # Fall back to the unfiltered call for older interpreters but
            # still reject absolute paths and .. components defensively.
            for member in tf.getmembers():
                name = member.name
                if name.startswith("/") or ".." in Path(name).parts:
                    raise tarfile.TarError(
                        f"refusing to extract unsafe path: {name!r}"
                    )
            try:
                tf.extractall(str(skills), filter="data")  # type: ignore[call-arg]
            except TypeError:
                # Python < 3.12 — no filter kwarg
                tf.extractall(str(skills))
    except (OSError, tarfile.TarError) as e:
        # Best-effort recover: move staged contents back
        for orig, dest in moved:
            try:
                shutil.move(str(dest), str(orig))
            except OSError:
                pass
        try:
            shutil.rmtree(staged, ignore_errors=True)
        except OSError:
            pass
        return (False, f"snapshot extract failed (state restored): {e}", None)

    # Extract succeeded — the staging dir has served its purpose. The
    # user's undo handle is the safety snapshot tarball we took earlier.
    try:
        shutil.rmtree(staged, ignore_errors=True)
    except OSError:
        pass

    logger.info("Curator rollback: restored from %s", target.name)
    return (True, f"restored from snapshot {target.name}", target)
# ---------------------------------------------------------------------------
|
|
# Human-readable summary for CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def format_size(n: int) -> str:
    """Render a byte count as a short human-readable string.

    Bytes are shown as an integer (``"512 B"``); larger values are shown
    with one decimal in the biggest unit under 1024, capped at GB
    (``"1.5 KB"``, ``"1024.0 GB"`` for anything >= 1024 GB).

    Fixes: the original loop's final ``return`` was unreachable (the GB
    branch always returned inside the loop), and the ``int``-annotated
    parameter was mutated by float division. Output is unchanged.
    """
    if n < 1024:
        return f"{n} B"
    size = float(n)
    for unit in ("KB", "MB"):
        size /= 1024
        if size < 1024:
            return f"{size:.1f} {unit}"
    # Anything left is at least 1 GB; never scale past GB.
    return f"{size / 1024:.1f} GB"
def summarize_backups() -> str:
    """Render the snapshot list as a fixed-width table for the CLI."""
    entries = list_backups()
    if not entries:
        return "No curator snapshots yet."
    header = f"{'id':<24} {'reason':<40} {'skills':>6} {'size':>8}"
    out = [header, "─" * len(header)]
    for entry in entries:
        # Reason is truncated to the column width; missing/empty → "?".
        reason = (entry.get("reason", "?") or "?")[:40]
        out.append(
            f"{entry.get('id', '?'):<24} "
            f"{reason:<40} "
            f"{entry.get('skill_files', 0):>6} "
            f"{format_size(int(entry.get('archive_bytes', 0))):>8}"
        )
    return "\n".join(out)