feat(curator): per-run reports — run.json + REPORT.md under logs/curator/ (#17307)

Every curator pass now emits a dated report directory under
`~/.hermes/logs/curator/{YYYYMMDD-HHMMSS}/` with two files:

- `run.json` — machine-readable full record (before/after snapshot,
  state transitions, all tool calls, model/provider, timing, full LLM
  final response untruncated, error if any)
- `REPORT.md` — human-readable markdown: model + duration header,
  auto-transition counts, LLM consolidation stats, archived-this-run
  list, new-skills-this-run list, state transitions, the full LLM
  final summary, and a recovery footer pointing at the archive + the
  `hermes curator restore` command

Reports live under `logs/curator/`, not inside `skills/` — they're
operational telemetry, not user-authored skill data, and belong
alongside `agent.log` / `gateway.log`.

Internals:
- `_run_llm_review()` now returns a dict (final, summary, model,
  provider, tool_calls, error) instead of a bare truncated string so
  the reporter has full fidelity
- Report writer is fully best-effort — any failure logs at DEBUG and
  never breaks the curator itself. Same-second rerun gets a numeric
  suffix so reports can't clobber each other
- Report path stamped into `.curator_state` as `last_report_path`
- `hermes curator status` surfaces a "last report:" line so users
  can immediately open the latest run

Tests (all green):
- 7 new tests in tests/agent/test_curator_reports.py covering: report
  location (logs not skills), both files written, run.json shape and
  diff accuracy, markdown structure, error path still writes, state
  transitions captured, same-second runs get unique dirs
- Existing test_run_review_synchronous_invokes_llm_stub updated to
  stub the new dict-returning _run_llm_review signature

Live E2E: ran a synchronous pass against a 1-skill test collection
with a stubbed LLM; report written correctly, state stamped with
last_report_path, markdown human-readable, run.json machine-parseable.
This commit is contained in:
Teknium 2026-04-28 23:23:11 -07:00 committed by GitHub
parent 2d137074a3
commit bc0d8a941e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 592 additions and 16 deletions

View file

@ -28,7 +28,7 @@ import tempfile
import threading
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Callable, Dict, Optional
from typing import Any, Callable, Dict, List, Optional, Set
from hermes_constants import get_hermes_home
from tools import skill_usage
@ -354,6 +354,218 @@ CURATOR_REVIEW_PROMPT = (
)
# ---------------------------------------------------------------------------
# Per-run reports — {YYYYMMDD-HHMMSS}/run.json + REPORT.md under logs/curator/
# ---------------------------------------------------------------------------
def _reports_root() -> Path:
    """Return the directory that holds curator run reports.

    Reports are operational telemetry, so they belong in the profile-aware
    logs tree (``~/.hermes/logs/curator/``) next to ``agent.log`` and
    ``gateway.log`` — never inside the user-authored data under
    ``~/.hermes/skills/``.
    """
    logs_dir = get_hermes_home() / "logs"
    return logs_dir / "curator"
def _write_run_report(
    *,
    started_at: datetime,
    elapsed_seconds: float,
    auto_counts: Dict[str, int],
    auto_summary: str,
    before_report: List[Dict[str, Any]],
    before_names: Set[str],
    after_report: List[Dict[str, Any]],
    llm_meta: Dict[str, Any],
) -> Optional[Path]:
    """Write run.json + REPORT.md under logs/curator/{YYYYMMDD-HHMMSS}/.

    Args:
        started_at: start time of the run; its timestamp names the report dir.
        elapsed_seconds: total wall time of the pass.
        auto_counts: counters from the pure (no-LLM) auto-transition pass.
        auto_summary: one-line human summary of the auto pass.
        before_report: skill snapshot taken before the LLM pass.
        before_names: names present in the before snapshot.
        after_report: skill snapshot taken after the LLM pass.
        llm_meta: dict from _run_llm_review (final/summary/model/provider/
            tool_calls/error).

    Returns the report directory path on success, None if the write
    couldn't happen (caller logs and continues — reporting is best-effort).
    """
    root = _reports_root()
    try:
        root.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.debug("Curator report dir create failed: %s", e)
        return None
    stamp = started_at.strftime("%Y%m%d-%H%M%S")
    run_dir = root / stamp
    # If we crash-reran within the same second, append a disambiguator
    # so the second run can't clobber the first report.
    suffix = 1
    while run_dir.exists():
        suffix += 1
        run_dir = root / f"{stamp}-{suffix}"
    try:
        run_dir.mkdir(parents=True, exist_ok=False)
    except Exception as e:
        logger.debug("Curator run dir create failed: %s", e)
        return None
    # Diff before/after. Drop rows without a usable string name: a
    # malformed snapshot entry would otherwise inject None into the name
    # sets and make the sorted() calls below raise TypeError.
    before_by_name = {
        r["name"]: r
        for r in before_report
        if isinstance(r, dict) and isinstance(r.get("name"), str)
    }
    after_by_name = {
        r["name"]: r
        for r in after_report
        if isinstance(r, dict) and isinstance(r.get("name"), str)
    }
    clean_before = {n for n in before_names if isinstance(n, str)}
    after_names = set(after_by_name.keys())
    removed = sorted(clean_before - after_names)  # archived during this run
    added = sorted(after_names - clean_before)  # new skills this run
    # State transitions between the two snapshots (e.g. active -> stale)
    transitions: List[Dict[str, str]] = []
    for name in sorted(after_names & clean_before):
        s_before = (before_by_name.get(name) or {}).get("state")
        s_after = (after_by_name.get(name) or {}).get("state")
        if s_before and s_after and s_before != s_after:
            transitions.append({"name": name, "from": s_before, "to": s_after})
    # Aggregate LLM tool calls by tool name for quick stats.
    tc_counts: Dict[str, int] = {}
    for tc in llm_meta.get("tool_calls", []) or []:
        name = tc.get("name", "unknown")
        tc_counts[name] = tc_counts.get(name, 0) + 1
    payload = {
        "started_at": started_at.isoformat(),
        "duration_seconds": round(elapsed_seconds, 2),
        "model": llm_meta.get("model", ""),
        "provider": llm_meta.get("provider", ""),
        "auto_transitions": auto_counts,
        # Record the auto-pass summary too (previously accepted but dropped).
        "auto_summary": auto_summary,
        "counts": {
            "before": len(clean_before),
            "after": len(after_names),
            "delta": len(after_names) - len(clean_before),
            "archived_this_run": len(removed),
            "added_this_run": len(added),
            "state_transitions": len(transitions),
            "tool_calls_total": sum(tc_counts.values()),
        },
        "tool_call_counts": tc_counts,
        "archived": removed,
        "added": added,
        "state_transitions": transitions,
        "llm_final": llm_meta.get("final", ""),
        "llm_summary": llm_meta.get("summary", ""),
        "llm_error": llm_meta.get("error"),
        "tool_calls": llm_meta.get("tool_calls", []),
    }
    # run.json — machine-readable, full fidelity. Each file write is its
    # own best-effort try: a failed run.json must not block REPORT.md.
    try:
        (run_dir / "run.json").write_text(
            json.dumps(payload, indent=2, ensure_ascii=False) + "\n",
            encoding="utf-8",
        )
    except Exception as e:
        logger.debug("Curator run.json write failed: %s", e)
    # REPORT.md — human-readable
    try:
        md = _render_report_markdown(payload)
        (run_dir / "REPORT.md").write_text(md, encoding="utf-8")
    except Exception as e:
        logger.debug("Curator REPORT.md write failed: %s", e)
    return run_dir
def _render_report_markdown(p: Dict[str, Any]) -> str:
"""Render the human-readable report."""
lines: List[str] = []
started = p.get("started_at", "")
duration = p.get("duration_seconds", 0) or 0
mins, secs = divmod(int(duration), 60)
dur_label = f"{mins}m {secs}s" if mins else f"{secs}s"
lines.append(f"# Curator run — {started}\n")
model = p.get("model") or "(not resolved)"
prov = p.get("provider") or "(not resolved)"
counts = p.get("counts") or {}
lines.append(
f"Model: `{model}` via `{prov}` · Duration: {dur_label} · "
f"Agent-created skills: {counts.get('before', 0)}{counts.get('after', 0)} "
f"({counts.get('delta', 0):+d})\n"
)
error = p.get("llm_error")
if error:
lines.append(f"> ⚠ LLM pass error: `{error}`\n")
# Auto-transitions (pure, no LLM)
auto = p.get("auto_transitions") or {}
lines.append("## Auto-transitions (pure, no LLM)\n")
lines.append(f"- checked: {auto.get('checked', 0)}")
lines.append(f"- marked stale: {auto.get('marked_stale', 0)}")
lines.append(f"- archived: {auto.get('archived', 0)}")
lines.append(f"- reactivated: {auto.get('reactivated', 0)}")
lines.append("")
# LLM pass numbers
tc_counts = p.get("tool_call_counts") or {}
lines.append("## LLM consolidation pass\n")
lines.append(f"- tool calls: **{counts.get('tool_calls_total', 0)}** "
f"(by name: {', '.join(f'{k}={v}' for k, v in sorted(tc_counts.items())) or 'none'})")
lines.append(f"- archived this run: **{counts.get('archived_this_run', 0)}**")
lines.append(f"- new skills this run: **{counts.get('added_this_run', 0)}**")
lines.append(f"- state transitions (active ↔ stale ↔ archived): "
f"**{counts.get('state_transitions', 0)}**")
lines.append("")
# Archived list
archived = p.get("archived") or []
if archived:
lines.append(f"### Skills archived ({len(archived)})\n")
lines.append("_Archived skills are at `~/.hermes/skills/.archive/`. "
"Restore any via `hermes curator restore <name>`._\n")
# Show first 50 inline, note truncation after that
SHOW = 50
for n in archived[:SHOW]:
lines.append(f"- `{n}`")
if len(archived) > SHOW:
lines.append(f"- … and {len(archived) - SHOW} more (see `run.json` for the full list)")
lines.append("")
# Added list
added = p.get("added") or []
if added:
lines.append(f"### New skills this run ({len(added)})\n")
lines.append("_Usually these are new class-level umbrellas created via `skill_manage action=create`._\n")
for n in added:
lines.append(f"- `{n}`")
lines.append("")
# State transitions
trans = p.get("state_transitions") or []
if trans:
lines.append(f"### State transitions ({len(trans)})\n")
for t in trans:
lines.append(f"- `{t.get('name')}`: {t.get('from')}{t.get('to')}")
lines.append("")
# Full LLM final response
final = (p.get("llm_final") or "").strip()
if final:
lines.append("## LLM final summary\n")
lines.append(final)
lines.append("")
elif not error:
llm_sum = p.get("llm_summary") or ""
if llm_sum:
lines.append("## LLM summary\n")
lines.append(llm_sum)
lines.append("")
# Recovery footer
lines.append("## Recovery\n")
lines.append("- Restore an archived skill: `hermes curator restore <name>`")
lines.append("- All archives live under `~/.hermes/skills/.archive/` and are recoverable by `mv`")
lines.append("- See `run.json` in this directory for the full machine-readable record.")
lines.append("")
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Orchestrator — spawn a forked AIAgent for the LLM review pass
# ---------------------------------------------------------------------------
@ -415,22 +627,72 @@ def run_curator_review(
def _llm_pass():
nonlocal auto_summary
# Snapshot skill state BEFORE the LLM pass so the report can diff.
try:
before_report = skill_usage.agent_created_report()
except Exception:
before_report = []
before_names = {r.get("name") for r in before_report if isinstance(r, dict)}
llm_meta: Dict[str, Any] = {}
try:
candidate_list = _render_candidate_list()
if "No agent-created skills" in candidate_list:
final_summary = f"auto: {auto_summary}; llm: skipped (no candidates)"
llm_meta = {
"final": "",
"summary": "skipped (no candidates)",
"model": "",
"provider": "",
"tool_calls": [],
"error": None,
}
else:
prompt = f"{CURATOR_REVIEW_PROMPT}\n\n{candidate_list}"
llm_summary = _run_llm_review(prompt)
final_summary = f"auto: {auto_summary}; llm: {llm_summary}"
llm_meta = _run_llm_review(prompt)
final_summary = (
f"auto: {auto_summary}; llm: {llm_meta.get('summary', 'no change')}"
)
except Exception as e:
logger.debug("Curator LLM pass failed: %s", e, exc_info=True)
final_summary = f"auto: {auto_summary}; llm: error ({e})"
llm_meta = {
"final": "",
"summary": f"error ({e})",
"model": "",
"provider": "",
"tool_calls": [],
"error": str(e),
}
elapsed = (datetime.now(timezone.utc) - start).total_seconds()
state2 = load_state()
state2["last_run_duration_seconds"] = elapsed
state2["last_run_summary"] = final_summary
# Write the per-run report. Runs in a best-effort try so a
# reporting bug never breaks the curator itself. Report path is
# recorded in state so `hermes curator status` can point at it.
try:
after_report = skill_usage.agent_created_report()
except Exception:
after_report = []
try:
report_path = _write_run_report(
started_at=start,
elapsed_seconds=elapsed,
auto_counts=counts,
auto_summary=auto_summary,
before_report=before_report,
before_names=before_names,
after_report=after_report,
llm_meta=llm_meta,
)
if report_path is not None:
state2["last_report_path"] = str(report_path)
except Exception as e:
logger.debug("Curator report write failed: %s", e, exc_info=True)
save_state(state2)
if on_summary:
@ -452,14 +714,34 @@ def run_curator_review(
}
def _run_llm_review(prompt: str) -> str:
"""Spawn an AIAgent fork to run the curator review prompt. Returns a short
summary of what the model said in its final response."""
def _run_llm_review(prompt: str) -> Dict[str, Any]:
"""Spawn an AIAgent fork to run the curator review prompt.
Returns a dict with:
- final: full (untruncated) final response from the reviewer
- summary: short summary suitable for state file (240-char cap)
- model, provider: what the fork actually ran on
- tool_calls: list of {name, arguments} for every tool call made during
the pass (arguments may be truncated for readability)
- error: set if the pass failed mid-run; final/summary may still be empty
Never raises; callers get a structured failure instead.
"""
import contextlib
result_meta: Dict[str, Any] = {
"final": "",
"summary": "",
"model": "",
"provider": "",
"tool_calls": [],
"error": None,
}
try:
from run_agent import AIAgent
except Exception as e:
return f"AIAgent import failed: {e}"
result_meta["error"] = f"AIAgent import failed: {e}"
result_meta["summary"] = result_meta["error"]
return result_meta
# Resolve provider + model the same way the CLI does, so the curator
# fork inherits the user's active main config rather than falling
@ -489,6 +771,9 @@ def _run_llm_review(prompt: str) -> str:
except Exception as e:
logger.debug("Curator provider resolution failed: %s", e, exc_info=True)
result_meta["model"] = _model_name
result_meta["provider"] = _resolved_provider or ""
review_agent = None
try:
review_agent = AIAgent(
@ -520,20 +805,43 @@ def _run_llm_review(prompt: str) -> str:
with open(os.devnull, "w") as _devnull, \
contextlib.redirect_stdout(_devnull), \
contextlib.redirect_stderr(_devnull):
result = review_agent.run_conversation(user_message=prompt)
conv_result = review_agent.run_conversation(user_message=prompt)
final = ""
if isinstance(result, dict):
final = str(result.get("final_response") or "").strip()
return (final[:240] + "") if len(final) > 240 else (final or "no change")
if isinstance(conv_result, dict):
final = str(conv_result.get("final_response") or "").strip()
result_meta["final"] = final
result_meta["summary"] = (final[:240] + "") if len(final) > 240 else (final or "no change")
# Collect tool calls for the report. Walk the forked agent's
# session messages and extract every tool_call made during the
# pass. Truncate argument payloads so a giant skill_manage create
# doesn't blow up the report.
_calls: List[Dict[str, Any]] = []
for msg in getattr(review_agent, "_session_messages", []) or []:
if not isinstance(msg, dict):
continue
tcs = msg.get("tool_calls") or []
for tc in tcs:
if not isinstance(tc, dict):
continue
fn = tc.get("function") or {}
name = fn.get("name") or ""
args_raw = fn.get("arguments") or ""
if isinstance(args_raw, str) and len(args_raw) > 400:
args_raw = args_raw[:400] + ""
_calls.append({"name": name, "arguments": args_raw})
result_meta["tool_calls"] = _calls
except Exception as e:
return f"error: {e}"
result_meta["error"] = f"error: {e}"
result_meta["summary"] = result_meta["error"]
finally:
if review_agent is not None:
try:
review_agent.close()
except Exception:
pass
return result_meta
# ---------------------------------------------------------------------------

View file

@ -55,6 +55,9 @@ def _cmd_status(args) -> int:
print(f" runs: {runs}")
print(f" last run: {_fmt_ts(last_run)}")
print(f" last summary: {summary}")
_report = state.get("last_report_path")
if _report:
print(f" last report: {_report}")
_ih = curator.get_interval_hours()
_interval_label = (
f"{_ih // 24}d" if _ih % 24 == 0 and _ih >= 24

View file

@ -271,10 +271,17 @@ def test_run_review_synchronous_invokes_llm_stub(curator_env, monkeypatch):
_write_skill(skills_dir, "a")
calls = []
monkeypatch.setattr(
c, "_run_llm_review",
lambda prompt: (calls.append(prompt), "stubbed-summary")[1],
)
def _stub(prompt):
calls.append(prompt)
return {
"final": "stubbed-summary",
"summary": "stubbed-summary",
"model": "stub-model",
"provider": "stub-provider",
"tool_calls": [],
"error": None,
}
monkeypatch.setattr(c, "_run_llm_review", _stub)
captured = []
c.run_curator_review(on_summary=lambda s: captured.append(s), synchronous=True)

View file

@ -0,0 +1,258 @@
"""Tests for the curator per-run report writer (run.json + REPORT.md).
Reports live under ``~/.hermes/logs/curator/{YYYYMMDD-HHMMSS}/`` alongside
the standard log dir, not inside the user's ``skills/`` data directory.
"""
from __future__ import annotations
import json
import os
from datetime import datetime, timezone, timedelta
from pathlib import Path
import pytest
@pytest.fixture
def curator_env(tmp_path, monkeypatch):
    """Isolated HERMES_HOME with a skills/ dir + reset curator module state.

    Yields a dict with the sandbox ``home`` path plus freshly reloaded
    ``curator`` and ``skill_usage`` modules, so any module-level paths are
    recomputed against the temp HERMES_HOME rather than the real one.
    """
    home = tmp_path / ".hermes"
    home.mkdir()
    (home / "skills").mkdir()
    (home / "logs").mkdir()
    # Point both the env var and Path.home() at the sandbox so either
    # resolution strategy lands inside tmp_path.
    monkeypatch.setenv("HERMES_HOME", str(home))
    monkeypatch.setattr(Path, "home", lambda: tmp_path)
    import importlib
    import hermes_constants
    # Reload order: hermes_constants first, presumably so the subsequent
    # curator/skill_usage reloads pick up the sandboxed home — keep as-is.
    importlib.reload(hermes_constants)
    from agent import curator
    importlib.reload(curator)
    from tools import skill_usage
    importlib.reload(skill_usage)
    yield {"home": home, "curator": curator, "skill_usage": skill_usage}
def _make_llm_meta(**overrides):
base = {
"final": "short summary of the pass",
"summary": "short summary",
"model": "test-model",
"provider": "test-provider",
"tool_calls": [],
"error": None,
}
base.update(overrides)
return base
def test_reports_root_is_under_logs_not_skills(curator_env):
    """Operational telemetry belongs with the logs, never with the user's
    authored skill data under skills/."""
    cur = curator_env["curator"]
    home = curator_env["home"]
    root = cur._reports_root()
    # Exactly logs/curator under the sandbox home...
    assert root == home / "logs" / "curator"
    # ...and nowhere near the skills/ tree.
    assert "skills" not in root.parts
def test_write_run_report_creates_both_files(curator_env):
    """A run emits both the machine record (run.json) and the human doc
    (REPORT.md) inside a timestamped dir under logs/curator/."""
    cur = curator_env["curator"]
    out = cur._write_run_report(
        started_at=datetime.now(timezone.utc),
        elapsed_seconds=12.345,
        auto_counts={"checked": 5, "marked_stale": 1, "archived": 0, "reactivated": 0},
        auto_summary="1 marked stale",
        before_report=[],
        before_names=set(),
        after_report=[],
        llm_meta=_make_llm_meta(),
    )
    assert out is not None
    assert out.is_dir()
    for fname in ("run.json", "REPORT.md"):
        assert (out / fname).exists()
    # The timestamped directory sits directly under logs/curator/
    assert out.parent == cur._reports_root()
def test_run_json_has_expected_shape(curator_env):
    """run.json must carry the machine-readable fields downstream tooling needs."""
    cur = curator_env["curator"]
    snapshot_before = [
        {"name": "old-thing", "state": "active", "pinned": False},
        {"name": "keeper", "state": "active", "pinned": True},
    ]
    snapshot_after = [
        {"name": "keeper", "state": "active", "pinned": True},
        {"name": "new-umbrella", "state": "active", "pinned": False},
    ]
    run_dir = cur._write_run_report(
        started_at=datetime.now(timezone.utc),
        elapsed_seconds=42.0,
        auto_counts={"checked": 2, "marked_stale": 0, "archived": 0, "reactivated": 0},
        auto_summary="no changes",
        before_report=snapshot_before,
        before_names={r["name"] for r in snapshot_before},
        after_report=snapshot_after,
        llm_meta=_make_llm_meta(
            final="I consolidated the whole universe.",
            tool_calls=[
                {"name": "skills_list", "arguments": "{}"},
                {"name": "skill_manage", "arguments": '{"action":"create"}'},
                {"name": "terminal", "arguments": "mv ..."},
            ],
        ),
    )
    payload = json.loads((run_dir / "run.json").read_text())
    # Top-level shape — every field downstream tooling reads must exist.
    expected_keys = (
        "started_at", "duration_seconds", "model", "provider",
        "auto_transitions", "counts", "tool_call_counts",
        "archived", "added", "state_transitions",
        "llm_final", "llm_summary", "llm_error", "tool_calls",
    )
    missing = [k for k in expected_keys if k not in payload]
    assert not missing, f"missing keys: {missing}"
    # Diff logic
    assert payload["archived"] == ["old-thing"]
    assert payload["added"] == ["new-umbrella"]
    # Counts reflect the diff
    counts = payload["counts"]
    assert (counts["before"], counts["after"]) == (2, 2)
    assert counts["archived_this_run"] == 1
    assert counts["added_this_run"] == 1
    # Tool call counts are aggregated per tool name
    assert payload["tool_call_counts"] == {
        "skills_list": 1, "skill_manage": 1, "terminal": 1,
    }
    assert counts["tool_calls_total"] == 3
def test_report_md_is_human_readable(curator_env):
    """REPORT.md should be a valid markdown doc with the key sections visible."""
    cur = curator_env["curator"]
    run_dir = cur._write_run_report(
        started_at=datetime.now(timezone.utc),
        elapsed_seconds=75.0,
        auto_counts={"checked": 10, "marked_stale": 2, "archived": 1, "reactivated": 0},
        auto_summary="2 marked stale, 1 archived",
        before_report=[{"name": "foo", "state": "active", "pinned": False}],
        before_names={"foo"},
        after_report=[{"name": "foo-umbrella", "state": "active", "pinned": False}],
        llm_meta=_make_llm_meta(
            final="Consolidated foo-like skills into foo-umbrella.",
            model="claude-opus-4.7",
            provider="openrouter",
        ),
    )
    md = (run_dir / "REPORT.md").read_text()
    # Structural checks — every major section heading is present.
    for section in ("# Curator run", "Auto-transitions",
                    "LLM consolidation pass", "Recovery"):
        assert section in md
    # The model / provider we passed in show up in the header.
    assert "claude-opus-4.7" in md
    assert "openrouter" in md
    # Archived + added lists are rendered with their skill names.
    assert "Skills archived" in md
    assert "`foo`" in md
    assert "New skills this run" in md
    assert "`foo-umbrella`" in md
    # The full LLM final response is included verbatim (no 240-char truncation).
    assert "Consolidated foo-like skills into foo-umbrella." in md
def test_same_second_reruns_get_unique_dirs(curator_env):
    """Two writes sharing one start timestamp must land in distinct dirs
    rather than the second overwriting the first."""
    cur = curator_env["curator"]
    common = dict(
        started_at=datetime(2026, 4, 29, 5, 33, 34, tzinfo=timezone.utc),
        elapsed_seconds=1.0,
        auto_counts={"checked": 0, "marked_stale": 0, "archived": 0, "reactivated": 0},
        auto_summary="no changes",
        before_report=[],
        before_names=set(),
        after_report=[],
        llm_meta=_make_llm_meta(),
    )
    first = cur._write_run_report(**common)
    second = cur._write_run_report(**common)
    assert first is not None
    assert second is not None
    assert first != second
    # The rerun keeps the timestamp and tacks on a numeric disambiguator.
    assert second.name.startswith(first.name)
def test_report_captures_llm_error_and_continues(curator_env):
    """An LLM-pass failure must not stop the report: both files still write,
    and the error is surfaced in each."""
    cur = curator_env["curator"]
    run_dir = cur._write_run_report(
        started_at=datetime.now(timezone.utc),
        elapsed_seconds=2.0,
        auto_counts={"checked": 0, "marked_stale": 0, "archived": 0, "reactivated": 0},
        auto_summary="no changes",
        before_report=[],
        before_names=set(),
        after_report=[],
        llm_meta=_make_llm_meta(
            error="HTTP 400: No models provided",
            final="",
            summary="error",
        ),
    )
    # Machine record carries the exact error string.
    body = json.loads((run_dir / "run.json").read_text())
    assert body["llm_error"] == "HTTP 400: No models provided"
    # Human report calls it out too.
    assert "HTTP 400" in (run_dir / "REPORT.md").read_text()
def test_state_transitions_captured_in_report(curator_env):
    """A skill whose state differs between the before/after snapshots
    (e.g. active → stale) shows up in both run.json and REPORT.md."""
    cur = curator_env["curator"]
    pre = [{"name": "getting-old", "state": "active", "pinned": False}]
    post = [{"name": "getting-old", "state": "stale", "pinned": False}]
    run_dir = cur._write_run_report(
        started_at=datetime.now(timezone.utc),
        elapsed_seconds=1.0,
        auto_counts={"checked": 1, "marked_stale": 1, "archived": 0, "reactivated": 0},
        auto_summary="1 marked stale",
        before_report=pre,
        before_names={r["name"] for r in pre},
        after_report=post,
        llm_meta=_make_llm_meta(),
    )
    data = json.loads((run_dir / "run.json").read_text())
    assert data["state_transitions"] == [
        {"name": "getting-old", "from": "active", "to": "stale"}
    ]
    report = (run_dir / "REPORT.md").read_text()
    for needle in ("State transitions", "getting-old", "active → stale"):
        assert needle in report