diff --git a/agent/curator.py b/agent/curator.py index 6858830aac..c93a9c1853 100644 --- a/agent/curator.py +++ b/agent/curator.py @@ -28,7 +28,7 @@ import tempfile import threading from datetime import datetime, timedelta, timezone from pathlib import Path -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, List, Optional, Set from hermes_constants import get_hermes_home from tools import skill_usage @@ -354,6 +354,218 @@ CURATOR_REVIEW_PROMPT = ( ) +# --------------------------------------------------------------------------- +# Per-run reports — {YYYYMMDD-HHMMSS}/run.json + REPORT.md under logs/curator/ +# --------------------------------------------------------------------------- + +def _reports_root() -> Path: + """Directory where curator run reports are written. + + Lives under the profile-aware logs dir (``~/.hermes/logs/curator/``) + alongside ``agent.log`` and ``gateway.log`` so it's found by anyone + looking for operational telemetry, not mixed in with the user's + authored skill data in ``~/.hermes/skills/``. + """ + return get_hermes_home() / "logs" / "curator" + + +def _write_run_report( + *, + started_at: datetime, + elapsed_seconds: float, + auto_counts: Dict[str, int], + auto_summary: str, + before_report: List[Dict[str, Any]], + before_names: Set[str], + after_report: List[Dict[str, Any]], + llm_meta: Dict[str, Any], +) -> Optional[Path]: + """Write run.json + REPORT.md under logs/curator/{YYYYMMDD-HHMMSS}/. + + Returns the report directory path on success, None if the write + couldn't happen (caller logs and continues — reporting is best-effort). + """ + root = _reports_root() + try: + root.mkdir(parents=True, exist_ok=True) + except Exception as e: + logger.debug("Curator report dir create failed: %s", e) + return None + + stamp = started_at.strftime("%Y%m%d-%H%M%S") + run_dir = root / stamp + # If we crash-reran within the same second, append a disambiguator + suffix = 1 + while run_dir.exists(): + suffix += 1 + run_dir = root / f"{stamp}-{suffix}" + try: + run_dir.mkdir(parents=True, exist_ok=False) + except Exception as e: + logger.debug("Curator run dir create failed: %s", e) + return None + + # Diff before/after + after_by_name = {r.get("name"): r for r in after_report if isinstance(r, dict)} + after_names = set(after_by_name.keys()) + removed = sorted(before_names - after_names) # archived during this run + added = sorted(after_names - before_names) # new skills this run + before_by_name = {r.get("name"): r for r in before_report if isinstance(r, dict)} + + # State transitions between the two snapshots (e.g. 
active -> stale) + transitions: List[Dict[str, str]] = [] + for name in sorted(after_names & before_names): + s_before = (before_by_name.get(name) or {}).get("state") + s_after = (after_by_name.get(name) or {}).get("state") + if s_before and s_after and s_before != s_after: + transitions.append({"name": name, "from": s_before, "to": s_after}) + + # Classify LLM tool calls + tc_counts: Dict[str, int] = {} + for tc in llm_meta.get("tool_calls", []) or []: + name = tc.get("name", "unknown") + tc_counts[name] = tc_counts.get(name, 0) + 1 + + payload = { + "started_at": started_at.isoformat(), + "duration_seconds": round(elapsed_seconds, 2), + "model": llm_meta.get("model", ""), + "provider": llm_meta.get("provider", ""), + "auto_transitions": auto_counts, + "counts": { + "before": len(before_names), + "after": len(after_names), + "delta": len(after_names) - len(before_names), + "archived_this_run": len(removed), + "added_this_run": len(added), + "state_transitions": len(transitions), + "tool_calls_total": sum(tc_counts.values()), + }, + "tool_call_counts": tc_counts, + "archived": removed, + "added": added, + "state_transitions": transitions, + "llm_final": llm_meta.get("final", ""), + "llm_summary": llm_meta.get("summary", ""), + "llm_error": llm_meta.get("error"), + "tool_calls": llm_meta.get("tool_calls", []), + } + + # run.json — machine-readable, full fidelity + try: + (run_dir / "run.json").write_text( + json.dumps(payload, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + except Exception as e: + logger.debug("Curator run.json write failed: %s", e) + + # REPORT.md — human-readable + try: + md = _render_report_markdown(payload) + (run_dir / "REPORT.md").write_text(md, encoding="utf-8") + except Exception as e: + logger.debug("Curator REPORT.md write failed: %s", e) + + return run_dir + + +def _render_report_markdown(p: Dict[str, Any]) -> str: + """Render the human-readable report.""" + lines: List[str] = [] + started = p.get("started_at", "") + duration = p.get("duration_seconds", 0) or 0 + mins, secs = divmod(int(duration), 60) + dur_label = f"{mins}m {secs}s" if mins else f"{secs}s" + + lines.append(f"# Curator run — {started}\n") + model = p.get("model") or "(not resolved)" + prov = p.get("provider") or "(not resolved)" + counts = p.get("counts") or {} + lines.append( + f"Model: `{model}` via `{prov}` · Duration: {dur_label} · " + f"Agent-created skills: {counts.get('before', 0)} → {counts.get('after', 0)} " + f"({counts.get('delta', 0):+d})\n" + ) + + error = p.get("llm_error") + if error: + lines.append(f"> ⚠ LLM pass error: `{error}`\n") + + # Auto-transitions (pure, no LLM) + auto = p.get("auto_transitions") or {} + lines.append("## Auto-transitions (pure, no LLM)\n") + lines.append(f"- checked: {auto.get('checked', 0)}") + lines.append(f"- marked stale: {auto.get('marked_stale', 0)}") + lines.append(f"- archived: {auto.get('archived', 0)}") + lines.append(f"- reactivated: {auto.get('reactivated', 0)}") + lines.append("") + + # LLM pass numbers + tc_counts = p.get("tool_call_counts") or {} + lines.append("## LLM consolidation pass\n") + lines.append(f"- tool calls: **{counts.get('tool_calls_total', 0)}** " + f"(by name: {', '.join(f'{k}={v}' for k, v in sorted(tc_counts.items())) or 'none'})") + lines.append(f"- archived this run: **{counts.get('archived_this_run', 0)}**") + lines.append(f"- new skills this run: **{counts.get('added_this_run', 0)}**") + lines.append(f"- state transitions (active ↔ stale ↔ archived): " + f"**{counts.get('state_transitions', 0)}**") + 
lines.append("") + + # Archived list + archived = p.get("archived") or [] + if archived: + lines.append(f"### Skills archived ({len(archived)})\n") + lines.append("_Archived skills are at `~/.hermes/skills/.archive/`. " + "Restore any via `hermes curator restore `._\n") + # Show first 50 inline, note truncation after that + SHOW = 50 + for n in archived[:SHOW]: + lines.append(f"- `{n}`") + if len(archived) > SHOW: + lines.append(f"- … and {len(archived) - SHOW} more (see `run.json` for the full list)") + lines.append("") + + # Added list + added = p.get("added") or [] + if added: + lines.append(f"### New skills this run ({len(added)})\n") + lines.append("_Usually these are new class-level umbrellas created via `skill_manage action=create`._\n") + for n in added: + lines.append(f"- `{n}`") + lines.append("") + + # State transitions + trans = p.get("state_transitions") or [] + if trans: + lines.append(f"### State transitions ({len(trans)})\n") + for t in trans: + lines.append(f"- `{t.get('name')}`: {t.get('from')} → {t.get('to')}") + lines.append("") + + # Full LLM final response + final = (p.get("llm_final") or "").strip() + if final: + lines.append("## LLM final summary\n") + lines.append(final) + lines.append("") + elif not error: + llm_sum = p.get("llm_summary") or "" + if llm_sum: + lines.append("## LLM summary\n") + lines.append(llm_sum) + lines.append("") + + # Recovery footer + lines.append("## Recovery\n") + lines.append("- Restore an archived skill: `hermes curator restore `") + lines.append("- All archives live under `~/.hermes/skills/.archive/` and are recoverable by `mv`") + lines.append("- See `run.json` in this directory for the full machine-readable record.") + lines.append("") + + return "\n".join(lines) + + # --------------------------------------------------------------------------- # Orchestrator — spawn a forked AIAgent for the LLM review pass # --------------------------------------------------------------------------- @@ -415,22 +627,72 @@ def run_curator_review( def _llm_pass(): nonlocal auto_summary + # Snapshot skill state BEFORE the LLM pass so the report can diff. + try: + before_report = skill_usage.agent_created_report() + except Exception: + before_report = [] + before_names = {r.get("name") for r in before_report if isinstance(r, dict)} + + llm_meta: Dict[str, Any] = {} try: candidate_list = _render_candidate_list() if "No agent-created skills" in candidate_list: final_summary = f"auto: {auto_summary}; llm: skipped (no candidates)" + llm_meta = { + "final": "", + "summary": "skipped (no candidates)", + "model": "", + "provider": "", + "tool_calls": [], + "error": None, + } else: prompt = f"{CURATOR_REVIEW_PROMPT}\n\n{candidate_list}" - llm_summary = _run_llm_review(prompt) - final_summary = f"auto: {auto_summary}; llm: {llm_summary}" + llm_meta = _run_llm_review(prompt) + final_summary = ( + f"auto: {auto_summary}; llm: {llm_meta.get('summary', 'no change')}" + ) except Exception as e: logger.debug("Curator LLM pass failed: %s", e, exc_info=True) final_summary = f"auto: {auto_summary}; llm: error ({e})" + llm_meta = { + "final": "", + "summary": f"error ({e})", + "model": "", + "provider": "", + "tool_calls": [], + "error": str(e), + } elapsed = (datetime.now(timezone.utc) - start).total_seconds() state2 = load_state() state2["last_run_duration_seconds"] = elapsed state2["last_run_summary"] = final_summary + + # Write the per-run report. Runs in a best-effort try so a + # reporting bug never breaks the curator itself. 
Report path is + # recorded in state so `hermes curator status` can point at it. + try: + after_report = skill_usage.agent_created_report() + except Exception: + after_report = [] + try: + report_path = _write_run_report( + started_at=start, + elapsed_seconds=elapsed, + auto_counts=counts, + auto_summary=auto_summary, + before_report=before_report, + before_names=before_names, + after_report=after_report, + llm_meta=llm_meta, + ) + if report_path is not None: + state2["last_report_path"] = str(report_path) + except Exception as e: + logger.debug("Curator report write failed: %s", e, exc_info=True) + save_state(state2) if on_summary: @@ -452,14 +714,34 @@ def run_curator_review( } -def _run_llm_review(prompt: str) -> str: - """Spawn an AIAgent fork to run the curator review prompt. Returns a short - summary of what the model said in its final response.""" +def _run_llm_review(prompt: str) -> Dict[str, Any]: + """Spawn an AIAgent fork to run the curator review prompt. + + Returns a dict with: + - final: full (untruncated) final response from the reviewer + - summary: short summary suitable for state file (240-char cap) + - model, provider: what the fork actually ran on + - tool_calls: list of {name, arguments} for every tool call made during + the pass (arguments may be truncated for readability) + - error: set if the pass failed mid-run; final/summary may still be empty + + Never raises; callers get a structured failure instead. + """ import contextlib + result_meta: Dict[str, Any] = { + "final": "", + "summary": "", + "model": "", + "provider": "", + "tool_calls": [], + "error": None, + } try: from run_agent import AIAgent except Exception as e: - return f"AIAgent import failed: {e}" + result_meta["error"] = f"AIAgent import failed: {e}" + result_meta["summary"] = result_meta["error"] + return result_meta # Resolve provider + model the same way the CLI does, so the curator # fork inherits the user's active main config rather than falling @@ -489,6 +771,9 @@ def _run_llm_review(prompt: str) -> str: except Exception as e: logger.debug("Curator provider resolution failed: %s", e, exc_info=True) + result_meta["model"] = _model_name + result_meta["provider"] = _resolved_provider or "" + review_agent = None try: review_agent = AIAgent( @@ -520,20 +805,43 @@ def _run_llm_review(prompt: str) -> str: with open(os.devnull, "w") as _devnull, \ contextlib.redirect_stdout(_devnull), \ contextlib.redirect_stderr(_devnull): - result = review_agent.run_conversation(user_message=prompt) + conv_result = review_agent.run_conversation(user_message=prompt) final = "" - if isinstance(result, dict): - final = str(result.get("final_response") or "").strip() - return (final[:240] + "…") if len(final) > 240 else (final or "no change") + if isinstance(conv_result, dict): + final = str(conv_result.get("final_response") or "").strip() + result_meta["final"] = final + result_meta["summary"] = (final[:240] + "…") if len(final) > 240 else (final or "no change") + + # Collect tool calls for the report. Walk the forked agent's + # session messages and extract every tool_call made during the + # pass. Truncate argument payloads so a giant skill_manage create + # doesn't blow up the report. 
+ _calls: List[Dict[str, Any]] = [] + for msg in getattr(review_agent, "_session_messages", []) or []: + if not isinstance(msg, dict): + continue + tcs = msg.get("tool_calls") or [] + for tc in tcs: + if not isinstance(tc, dict): + continue + fn = tc.get("function") or {} + name = fn.get("name") or "" + args_raw = fn.get("arguments") or "" + if isinstance(args_raw, str) and len(args_raw) > 400: + args_raw = args_raw[:400] + "…" + _calls.append({"name": name, "arguments": args_raw}) + result_meta["tool_calls"] = _calls except Exception as e: - return f"error: {e}" + result_meta["error"] = f"error: {e}" + result_meta["summary"] = result_meta["error"] finally: if review_agent is not None: try: review_agent.close() except Exception: pass + return result_meta # --------------------------------------------------------------------------- diff --git a/hermes_cli/curator.py b/hermes_cli/curator.py index f580005794..a8bbcbafb0 100644 --- a/hermes_cli/curator.py +++ b/hermes_cli/curator.py @@ -55,6 +55,9 @@ def _cmd_status(args) -> int: print(f" runs: {runs}") print(f" last run: {_fmt_ts(last_run)}") print(f" last summary: {summary}") + _report = state.get("last_report_path") + if _report: + print(f" last report: {_report}") _ih = curator.get_interval_hours() _interval_label = ( f"{_ih // 24}d" if _ih % 24 == 0 and _ih >= 24 diff --git a/tests/agent/test_curator.py b/tests/agent/test_curator.py index a8a4b5ada3..c9118d1ad5 100644 --- a/tests/agent/test_curator.py +++ b/tests/agent/test_curator.py @@ -271,10 +271,17 @@ def test_run_review_synchronous_invokes_llm_stub(curator_env, monkeypatch): _write_skill(skills_dir, "a") calls = [] - monkeypatch.setattr( - c, "_run_llm_review", - lambda prompt: (calls.append(prompt), "stubbed-summary")[1], - ) + def _stub(prompt): + calls.append(prompt) + return { + "final": "stubbed-summary", + "summary": "stubbed-summary", + "model": "stub-model", + "provider": "stub-provider", + "tool_calls": [], + "error": None, + } + monkeypatch.setattr(c, "_run_llm_review", _stub) captured = [] c.run_curator_review(on_summary=lambda s: captured.append(s), synchronous=True) diff --git a/tests/agent/test_curator_reports.py b/tests/agent/test_curator_reports.py new file mode 100644 index 0000000000..3c94c231c1 --- /dev/null +++ b/tests/agent/test_curator_reports.py @@ -0,0 +1,258 @@ +"""Tests for the curator per-run report writer (run.json + REPORT.md). + +Reports live under ``~/.hermes/logs/curator/{YYYYMMDD-HHMMSS}/`` alongside +the standard log dir, not inside the user's ``skills/`` data directory. 
+""" + +from __future__ import annotations + +import json +import os +from datetime import datetime, timezone, timedelta +from pathlib import Path + +import pytest + + +@pytest.fixture +def curator_env(tmp_path, monkeypatch): + """Isolated HERMES_HOME with a skills/ dir + reset curator module state.""" + home = tmp_path / ".hermes" + home.mkdir() + (home / "skills").mkdir() + (home / "logs").mkdir() + monkeypatch.setenv("HERMES_HOME", str(home)) + monkeypatch.setattr(Path, "home", lambda: tmp_path) + + import importlib + import hermes_constants + importlib.reload(hermes_constants) + from agent import curator + importlib.reload(curator) + from tools import skill_usage + importlib.reload(skill_usage) + yield {"home": home, "curator": curator, "skill_usage": skill_usage} + + +def _make_llm_meta(**overrides): + base = { + "final": "short summary of the pass", + "summary": "short summary", + "model": "test-model", + "provider": "test-provider", + "tool_calls": [], + "error": None, + } + base.update(overrides) + return base + + +def test_reports_root_is_under_logs_not_skills(curator_env): + """Reports live in logs/curator/, not skills/ — operational telemetry + belongs with the logs, not with user-authored skill data.""" + curator = curator_env["curator"] + root = curator._reports_root() + home = curator_env["home"] + # Must be under logs/ + assert root == home / "logs" / "curator" + # Must NOT be under skills/ + assert "skills" not in root.parts + + +def test_write_run_report_creates_both_files(curator_env): + """Each run writes both a run.json (machine) and a REPORT.md (human).""" + curator = curator_env["curator"] + start = datetime.now(timezone.utc) + + run_dir = curator._write_run_report( + started_at=start, + elapsed_seconds=12.345, + auto_counts={"checked": 5, "marked_stale": 1, "archived": 0, "reactivated": 0}, + auto_summary="1 marked stale", + before_report=[], + before_names=set(), + after_report=[], + llm_meta=_make_llm_meta(), + ) + assert run_dir is not None + assert run_dir.is_dir() + assert (run_dir / "run.json").exists() + assert (run_dir / "REPORT.md").exists() + + # The directory name is a timestamp under logs/curator/ + assert run_dir.parent == curator._reports_root() + + +def test_run_json_has_expected_shape(curator_env): + """run.json must carry the machine-readable fields downstream tooling needs.""" + curator = curator_env["curator"] + start = datetime.now(timezone.utc) + + before_report = [ + {"name": "old-thing", "state": "active", "pinned": False}, + {"name": "keeper", "state": "active", "pinned": True}, + ] + after_report = [ + {"name": "keeper", "state": "active", "pinned": True}, + {"name": "new-umbrella", "state": "active", "pinned": False}, + ] + + run_dir = curator._write_run_report( + started_at=start, + elapsed_seconds=42.0, + auto_counts={"checked": 2, "marked_stale": 0, "archived": 0, "reactivated": 0}, + auto_summary="no changes", + before_report=before_report, + before_names={r["name"] for r in before_report}, + after_report=after_report, + llm_meta=_make_llm_meta( + final="I consolidated the whole universe.", + tool_calls=[ + {"name": "skills_list", "arguments": "{}"}, + {"name": "skill_manage", "arguments": '{"action":"create"}'}, + {"name": "terminal", "arguments": "mv ..."}, + ], + ), + ) + payload = json.loads((run_dir / "run.json").read_text()) + + # top-level shape + for k in ( + "started_at", "duration_seconds", "model", "provider", + "auto_transitions", "counts", "tool_call_counts", + "archived", "added", "state_transitions", + "llm_final", 
"llm_summary", "llm_error", "tool_calls", + ): + assert k in payload, f"missing key: {k}" + + # Diff logic + assert payload["archived"] == ["old-thing"] + assert payload["added"] == ["new-umbrella"] + # Counts reflect the diff + assert payload["counts"]["before"] == 2 + assert payload["counts"]["after"] == 2 + assert payload["counts"]["archived_this_run"] == 1 + assert payload["counts"]["added_this_run"] == 1 + # Tool call counts are aggregated + assert payload["tool_call_counts"]["skills_list"] == 1 + assert payload["tool_call_counts"]["skill_manage"] == 1 + assert payload["tool_call_counts"]["terminal"] == 1 + assert payload["counts"]["tool_calls_total"] == 3 + + +def test_report_md_is_human_readable(curator_env): + """REPORT.md should be a valid markdown doc with the key sections visible.""" + curator = curator_env["curator"] + start = datetime.now(timezone.utc) + + run_dir = curator._write_run_report( + started_at=start, + elapsed_seconds=75.0, + auto_counts={"checked": 10, "marked_stale": 2, "archived": 1, "reactivated": 0}, + auto_summary="2 marked stale, 1 archived", + before_report=[{"name": "foo", "state": "active", "pinned": False}], + before_names={"foo"}, + after_report=[{"name": "foo-umbrella", "state": "active", "pinned": False}], + llm_meta=_make_llm_meta( + final="Consolidated foo-like skills into foo-umbrella.", + model="claude-opus-4.7", + provider="openrouter", + ), + ) + md = (run_dir / "REPORT.md").read_text() + + # Structural checks + assert "# Curator run" in md + assert "Auto-transitions" in md + assert "LLM consolidation pass" in md + assert "Recovery" in md + + # The model / provider we passed in show up + assert "claude-opus-4.7" in md + assert "openrouter" in md + + # The added/archived lists are present + assert "Skills archived" in md + assert "`foo`" in md + assert "New skills this run" in md + assert "`foo-umbrella`" in md + + # The full LLM final response is included verbatim (no 240-char truncation) + assert "Consolidated foo-like skills into foo-umbrella." 
in md + + +def test_same_second_reruns_get_unique_dirs(curator_env): + """If the curator somehow runs twice in the same second, the second + report still gets its own directory rather than overwriting the first.""" + curator = curator_env["curator"] + start = datetime(2026, 4, 29, 5, 33, 34, tzinfo=timezone.utc) + + kwargs = dict( + started_at=start, + elapsed_seconds=1.0, + auto_counts={"checked": 0, "marked_stale": 0, "archived": 0, "reactivated": 0}, + auto_summary="no changes", + before_report=[], + before_names=set(), + after_report=[], + llm_meta=_make_llm_meta(), + ) + a = curator._write_run_report(**kwargs) + b = curator._write_run_report(**kwargs) + assert a != b + assert a is not None and b is not None + # Second dir has a numeric disambiguator suffix + assert b.name.startswith(a.name) + + +def test_report_captures_llm_error_and_continues(curator_env): + """If the LLM pass recorded an error, the report still writes and + surfaces the error prominently.""" + curator = curator_env["curator"] + run_dir = curator._write_run_report( + started_at=datetime.now(timezone.utc), + elapsed_seconds=2.0, + auto_counts={"checked": 0, "marked_stale": 0, "archived": 0, "reactivated": 0}, + auto_summary="no changes", + before_report=[], + before_names=set(), + after_report=[], + llm_meta=_make_llm_meta( + error="HTTP 400: No models provided", + final="", + summary="error", + ), + ) + md = (run_dir / "REPORT.md").read_text() + assert "HTTP 400" in md + payload = json.loads((run_dir / "run.json").read_text()) + assert payload["llm_error"] == "HTTP 400: No models provided" + + +def test_state_transitions_captured_in_report(curator_env): + """When a skill moves active → stale or stale → archived between + before/after snapshots, the report records it.""" + curator = curator_env["curator"] + start = datetime.now(timezone.utc) + + before = [{"name": "getting-old", "state": "active", "pinned": False}] + after = [{"name": "getting-old", "state": "stale", "pinned": False}] + + run_dir = curator._write_run_report( + started_at=start, + elapsed_seconds=1.0, + auto_counts={"checked": 1, "marked_stale": 1, "archived": 0, "reactivated": 0}, + auto_summary="1 marked stale", + before_report=before, + before_names={r["name"] for r in before}, + after_report=after, + llm_meta=_make_llm_meta(), + ) + payload = json.loads((run_dir / "run.json").read_text()) + assert payload["state_transitions"] == [ + {"name": "getting-old", "from": "active", "to": "stale"} + ] + md = (run_dir / "REPORT.md").read_text() + assert "State transitions" in md + assert "getting-old" in md + assert "active → stale" in md
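
A minimal consumer-side sketch of reading these reports — the `latest_curator_report` helper is hypothetical, and it assumes only the default `~/.hermes` home plus the run.json fields built by `_write_run_report` above:

    import json
    from pathlib import Path

    def latest_curator_report(hermes_home: Path) -> dict:
        """Return the payload of the newest curator run.json, or {} if none exists."""
        root = hermes_home / "logs" / "curator"
        if not root.is_dir():
            return {}
        # Run dirs are named {YYYYMMDD-HHMMSS}[-N], so a lexical sort is
        # chronological (same-second rerun suffixes aside).
        runs = sorted(p for p in root.iterdir() if p.is_dir())
        if not runs:
            return {}
        run_json = runs[-1] / "run.json"
        if not run_json.exists():
            return {}
        return json.loads(run_json.read_text(encoding="utf-8"))

    # Example: one-line summary of the most recent run
    payload = latest_curator_report(Path.home() / ".hermes")
    if payload:
        c = payload.get("counts", {})
        print(f"{payload.get('started_at')}: "
              f"{c.get('archived_this_run', 0)} archived, "
              f"{c.get('added_this_run', 0)} added, "
              f"error={payload.get('llm_error')}")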