feat(background-review): aux-model selector for the self-improvement review (#49252)

Adds auxiliary.background_review.{provider,model} (default auto = main chat model — unchanged). Set it to a different, cheaper model and the post-turn self-improvement review runs there for ~3-5x lower cost. Cache-aware by design: the main chat is warm in the prompt cache, so the default full-history replay on the main model is cheap cache reads — left exactly as-is. A different model can't reuse that cache (different key), so when (and only when) routed to a different model the fork replays a compact digest instead of the full transcript, minimising what it cold-writes on the aux model. Same model -> full replay; different model -> digest. Quality holds in benchmarks: memory capture identical, skill near-identical. Nothing changes unless you opt in by naming a different model. Co-authored-by: Hermes Agent <noreply@nousresearch.com>
2026-06-23 10:42:00 +00:00 · 2026-06-22 14:54:53 -07:00 · 2026-06-22 14:54:53 -07:00 · 87c4a5ebb8
commit 87c4a5ebb8
parent 660e36f097
4 changed files with 341 additions and 27 deletions
--- a/agent/background_review.py
+++ b/agent/background_review.py
@ -27,6 +27,131 @@ from typing import Any, Dict, List, Optional
 logger = logging.getLogger(__name__)


+# ---------------------------------------------------------------------------
+# Background-review aux-model selector + routed digest.
+#
+# The review fork runs on the MAIN model by default ("auto"), replaying the
+# full conversation — already warm in the prompt cache, so cheap cache reads.
+# Optimal and unchanged. A user can route the review to a different, cheaper
+# model via auxiliary.background_review.{provider,model}. A different model
+# cannot reuse the parent's cache (different key), so the fork is cold
+# regardless — replaying the full transcript would just cold-write it. So when
+# (and only when) routed to a different model, we replay a compact DIGEST to
+# minimise cold-written tokens. Same model -> full replay; different model ->
+# digest. That's the whole policy.
+# ---------------------------------------------------------------------------
+
+
+def _resolve_review_runtime(agent: Any) -> Dict[str, Any]:
+    """Resolve provider/model/credentials for the review fork.
+
+    Default (auto / unset / same as parent): inherit the parent's live runtime
+    (with codex_app_server -> codex_responses downgrade). ``routed`` is False —
+    the fork uses the main model and the warm cache, exactly as before. When
+    ``auxiliary.background_review.{provider,model}`` names a concrete model
+    different from the parent's, resolve that runtime and set ``routed=True``.
+    """
+    parent_runtime = agent._current_main_runtime()
+    parent_api_mode = parent_runtime.get("api_mode") or None
+    if parent_api_mode == "codex_app_server":
+        parent_api_mode = "codex_responses"
+    parent = {
+        "provider": agent.provider,
+        "model": agent.model,
+        "api_key": parent_runtime.get("api_key") or None,
+        "base_url": parent_runtime.get("base_url") or None,
+        "api_mode": parent_api_mode,
+        "routed": False,
+    }
+    try:
+        from hermes_cli.config import load_config
+        cfg = load_config()
+    except Exception:
+        return parent
+    aux = cfg.get("auxiliary", {}) if isinstance(cfg.get("auxiliary"), dict) else {}
+    task = aux.get("background_review", {}) if isinstance(aux.get("background_review"), dict) else {}
+    task_provider = (str(task.get("provider", "")).strip() or None)
+    task_model = (str(task.get("model", "")).strip() or None)
+    task_base_url = (str(task.get("base_url", "")).strip() or None)
+    task_api_key = (str(task.get("api_key", "")).strip() or None)
+    if not (task_provider and task_provider != "auto" and task_model):
+        return parent
+    if task_provider == (agent.provider or "") and task_model == (agent.model or ""):
+        return parent  # same model/provider as parent -> not routed
+    try:
+        from hermes_cli.runtime_provider import resolve_runtime_provider
+        rp = resolve_runtime_provider(
+            requested=task_provider,
+            target_model=task_model,
+            explicit_api_key=task_api_key,
+            explicit_base_url=task_base_url,
+        )
+        return {
+            "provider": rp.get("provider") or task_provider,
+            "model": task_model,
+            "api_key": rp.get("api_key"),
+            "base_url": rp.get("base_url"),
+            "api_mode": rp.get("api_mode"),
+            "routed": True,
+        }
+    except Exception as e:
+        logger.debug("background-review aux routing failed (%s); using main model", e)
+        return parent
+
+
+def _msg_text(m: Dict) -> str:
+    c = m.get("content")
+    if isinstance(c, str):
+        return c.strip()
+    if isinstance(c, list):
+        return " ".join(b.get("text", "") for b in c if isinstance(b, dict)).strip()
+    return ""
+
+
+def _digest_history(messages_snapshot: List[Dict], tail: int = 24) -> List[Dict]:
+    """Compact replay for the routed (different-model) path only.
+
+    Keeps the recent ``tail`` messages verbatim, collapses older turns into one
+    synthetic user-role digest, preserving role alternation. Used ONLY when
+    routed to a different model (cache cold regardless, so fewer cold-written
+    tokens is a pure win). Never on the main-model path (full replay stays warm).
+    """
+    msgs = list(messages_snapshot or [])
+    if len(msgs) <= tail:
+        return msgs
+    keep = msgs[-tail:]
+    while keep and isinstance(keep[0], dict) and keep[0].get("role") == "tool":
+        tail += 1
+        if len(msgs) <= tail:
+            return msgs
+        keep = msgs[-tail:]
+    old = msgs[:-len(keep)]
+    lines: List[str] = []
+    for m in old:
+        if not isinstance(m, dict):
+            continue
+        role = m.get("role")
+        text = _msg_text(m).replace("\n", " ")
+        if role == "user" and text:
+            lines.append(f"USER: {text[:300]}")
+        elif role == "assistant":
+            tcs = m.get("tool_calls") or []
+            if tcs:
+                names = [(tc.get("function") or {}).get("name", "?") for tc in tcs if isinstance(tc, dict)]
+                lines.append(f"ASSISTANT[tools: {', '.join(names)}]")
+            if text:
+                lines.append(f"ASSISTANT: {text[:200]}")
+    digest = {
+        "role": "user",
+        "content": (
+            "[Earlier conversation digest — older turns summarised to bound the "
+            "review's cold-write cost on the routed aux model. Recent turns "
+            "follow verbatim below.]\n" + "\n".join(lines)
+        ),
+    }
+    return [digest] + keep
+
+
 # Review-prompt strings — used by ``spawn_background_review_thread`` to build
 # the user-message that the forked review agent receives.  AIAgent exposes
 # them as class attributes (``_MEMORY_REVIEW_PROMPT`` etc.) for back-compat;
@ -488,18 +613,13 @@ def _run_review_in_thread(
            # creds, or credential-pool setups where the resolver can't
            # reconstruct auth from scratch -- producing the spurious
            # "No LLM provider configured" warning at end of turn.
-            _parent_runtime = agent._current_main_runtime()
-            _parent_api_mode = _parent_runtime.get("api_mode") or None
-            # The review fork needs to call agent-loop tools (memory,
-            # skill_manage). Those tools require Hermes' own dispatch,
-            # which the codex_app_server runtime bypasses entirely
-            # (it runs the turn inside codex's subprocess). So when
-            # the parent is on codex_app_server, downgrade the review
-            # fork to codex_responses — same auth/credentials, but
-            # talks to the OpenAI Responses API directly so Hermes
-            # owns the loop and the agent-loop tools dispatch.
-            if _parent_api_mode == "codex_app_server":
-                _parent_api_mode = "codex_responses"
+            # _resolve_review_runtime() returns the parent's live runtime by
+            # default (routed=False; main model, warm cache), or — when the user
+            # set auxiliary.background_review.{provider,model} to a different
+            # model — that model's runtime (routed=True). The codex_app_server
+            # -> codex_responses downgrade is applied inside the resolver.
+            _rt = _resolve_review_runtime(agent)
+            _routed = bool(_rt.get("routed"))
            # skip_memory=True keeps the review fork from
            # touching external memory plugins (honcho, mem0,
            # supermemory, etc.).  Without it, the fork's
@ -519,14 +639,14 @@ def _run_review_in_thread(
            # in the request body — Anthropic's cache key includes it.
            # (The runtime whitelist below still restricts dispatch.)
            review_agent = AIAgent(
-                model=agent.model,
+                model=_rt.get("model") or agent.model,
                max_iterations=16,
                quiet_mode=True,
                platform=agent.platform,
-                provider=agent.provider,
-                api_mode=_parent_api_mode,
-                base_url=_parent_runtime.get("base_url") or None,
-                api_key=_parent_runtime.get("api_key") or None,
+                provider=_rt.get("provider") or agent.provider,
+                api_mode=_rt.get("api_mode"),
+                base_url=_rt.get("base_url") or None,
+                api_key=_rt.get("api_key") or None,
                credential_pool=getattr(agent, "_credential_pool", None),
                parent_session_id=agent.session_id,
                enabled_toolsets=getattr(agent, "enabled_toolsets", None),
@ -565,15 +685,20 @@ def _run_review_in_thread(
            # issue #25322 and PR #17276 for the full analysis +
            # measured impact (~26% end-to-end cost reduction on
            # Sonnet 4.5).
-            review_agent._cached_system_prompt = agent._cached_system_prompt
-            # Defensive: pin session_start + session_id to the
-            # parent's so any code path that re-renders parts of
-            # the system prompt (compression, plugin hooks) still
-            # produces byte-identical output. The cached-prompt
-            # assignment above already short-circuits the normal
-            # rebuild path, but these pins guarantee parity even
-            # if a future code path bypasses the cache.
-            review_agent.session_start = agent.session_start
+            # Share the parent's warm cached system prompt ONLY when the review
+            # runs on the SAME model (not routed). When routed to a different
+            # model the parent's cached prompt is for the wrong model/cache key
+            # and would miss anyway, so let the routed fork build its own.
+            if not _routed:
+                review_agent._cached_system_prompt = agent._cached_system_prompt
+                # Defensive: pin session_start + session_id to the
+                # parent's so any code path that re-renders parts of
+                # the system prompt (compression, plugin hooks) still
+                # produces byte-identical output. The cached-prompt
+                # assignment above already short-circuits the normal
+                # rebuild path, but these pins guarantee parity even
+                # if a future code path bypasses the cache.
+                review_agent.session_start = agent.session_start
            review_agent.session_id = agent.session_id
            # The fork shares the parent's live session_id (pinned above for
            # prefix-cache parity). It is single-lifecycle and calls close()
@ -615,6 +740,13 @@ def _run_review_in_thread(
                ),
            )
            try:
+                # Routed to a different model -> replay a digest (cache is cold
+                # on that model anyway, so minimise cold-written tokens). Same
+                # model -> replay the full snapshot (warm cache reads).
+                _review_history = (
+                    _digest_history(messages_snapshot) if _routed
+                    else messages_snapshot
+                )
                review_agent.run_conversation(
                    user_message=(
                        prompt
@ -622,7 +754,7 @@ def _run_review_in_thread(
                        "management tools. Other tools will be denied "
                        "at runtime — do not attempt them."
                    ),
-                    conversation_history=messages_snapshot,
+                    conversation_history=_review_history,
                )
            finally:
                clear_thread_tool_whitelist()
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -1535,6 +1535,25 @@ DEFAULT_CONFIG = {
            "timeout": 60,
            "extra_body": {},
        },
+        # Background review — the post-turn self-improvement fork that decides
+        # whether to save a memory / patch a skill. "auto" (default) = run on
+        # the main chat model, replaying the full conversation, which is already
+        # warm in the prompt cache (cheap cache reads) — unchanged, optimal.
+        # Set provider/model to a cheaper model (e.g. openrouter
+        # google/gemini-3-flash-preview) to run the review there for ~3-5x lower
+        # cost. A different model can't reuse the main prompt cache anyway, so
+        # the fork automatically replays a compact digest instead of the full
+        # transcript when routed (minimises the cold-write). Same model = full
+        # replay; different model = digest. Quality holds (memory capture
+        # identical, skill near-identical in benchmarks).
+        "background_review": {
+            "provider": "auto",
+            "model": "",
+            "base_url": "",
+            "api_key": "",
+            "timeout": 120,
+            "extra_body": {},
+        },
    },
    
    "display": {
--- a/tests/run_agent/test_background_review_cost_controls.py
+++ b/tests/run_agent/test_background_review_cost_controls.py
@ -0,0 +1,138 @@
+"""Unit coverage for the background-review aux-model selector + routed digest.
+
+Covers the two behaviors this change adds:
+  • _resolve_review_runtime — auto/same-model → not routed (main model, warm
+    cache); a configured different model → routed with resolved credentials.
+  • _digest_history — compact replay used ONLY on the routed path (recent tail
+    verbatim + a digest of older turns), preserving role alternation.
+
+Pure-function / config-driven; no live model calls.
+"""
+from unittest.mock import patch
+
+from agent import background_review as br
+
+
+def _msg(role, content, tool_calls=None):
+    m = {"role": role, "content": content}
+    if tool_calls:
+        m["tool_calls"] = tool_calls
+    return m
+
+
+# ---------------------------------------------------------------------------
+# _resolve_review_runtime — the aux-model selector
+# ---------------------------------------------------------------------------
+
+class _FakeAgent:
+    def __init__(self, provider="openai-codex", model="gpt-5.5"):
+        self.provider = provider
+        self.model = model
+
+    def _current_main_runtime(self):
+        return {
+            "api_key": "parent-key",
+            "base_url": "https://chatgpt.com/backend-api/codex",
+            "api_mode": "codex_app_server",
+        }
+
+
+def test_routing_auto_inherits_parent_and_downgrades_codex_app_server():
+    agent = _FakeAgent()
+    cfg = {"auxiliary": {"background_review": {"provider": "auto", "model": ""}}}
+    with patch("hermes_cli.config.load_config", return_value=cfg):
+        rt = br._resolve_review_runtime(agent)
+    assert rt["routed"] is False
+    assert rt["provider"] == "openai-codex"
+    assert rt["model"] == "gpt-5.5"
+    assert rt["api_mode"] == "codex_responses"  # downgraded so agent-loop tools dispatch
+
+
+def test_routing_to_different_model_marks_routed_and_resolves_credentials():
+    agent = _FakeAgent()
+    cfg = {"auxiliary": {"background_review": {
+        "provider": "openrouter", "model": "google/gemini-3-flash-preview",
+    }}}
+    fake_rp = {
+        "provider": "openrouter", "api_key": "or-key",
+        "base_url": "https://openrouter.ai/api/v1", "api_mode": "chat_completions",
+    }
+    with patch("hermes_cli.config.load_config", return_value=cfg), \
+         patch("hermes_cli.runtime_provider.resolve_runtime_provider", return_value=fake_rp):
+        rt = br._resolve_review_runtime(agent)
+    assert rt["routed"] is True
+    assert rt["provider"] == "openrouter"
+    assert rt["model"] == "google/gemini-3-flash-preview"
+    assert rt["api_key"] == "or-key"
+
+
+def test_routing_same_model_as_parent_is_not_routed():
+    agent = _FakeAgent(provider="openrouter", model="anthropic/claude-opus-4.8")
+    cfg = {"auxiliary": {"background_review": {
+        "provider": "openrouter", "model": "anthropic/claude-opus-4.8",
+    }}}
+    with patch("hermes_cli.config.load_config", return_value=cfg):
+        rt = br._resolve_review_runtime(agent)
+    assert rt["routed"] is False  # same model/provider → keep full-replay path
+
+
+def test_routing_resolution_failure_falls_back_to_parent():
+    agent = _FakeAgent()
+    cfg = {"auxiliary": {"background_review": {
+        "provider": "openrouter", "model": "google/gemini-3-flash-preview",
+    }}}
+    with patch("hermes_cli.config.load_config", return_value=cfg), \
+         patch("hermes_cli.runtime_provider.resolve_runtime_provider",
+               side_effect=RuntimeError("boom")):
+        rt = br._resolve_review_runtime(agent)
+    assert rt["routed"] is False
+    assert rt["provider"] == "openai-codex"
+
+
+# ---------------------------------------------------------------------------
+# _digest_history — routed-path compact replay
+# ---------------------------------------------------------------------------
+
+def test_digest_under_tail_returns_full():
+    msgs = [_msg("user", "hi"), _msg("assistant", "hello")]
+    assert br._digest_history(msgs, tail=24) == msgs
+
+
+def test_digest_collapses_old_keeps_tail_verbatim():
+    msgs = []
+    for i in range(60):
+        msgs.append(_msg("user", f"u{i} " + "x" * 50))
+        msgs.append(_msg("assistant", f"a{i} " + "y" * 50))
+    out = br._digest_history(msgs, tail=10)
+    # First message is the synthetic digest (user role → alternation preserved).
+    assert out[0]["role"] == "user"
+    assert out[0]["content"].startswith("[Earlier conversation digest")
+    # Recent tail preserved verbatim.
+    assert out[-1] == msgs[-1]
+    assert len(out) == 11  # 1 digest + 10 tail
+
+
+def test_digest_does_not_open_tail_on_a_tool_message():
+    msgs = []
+    for i in range(40):
+        msgs.append(_msg("user", "u" + "x" * 50))
+        msgs.append(_msg("assistant", "", tool_calls=[
+            {"function": {"name": "terminal", "arguments": "{}"}}]))
+        msgs.append({"role": "tool", "content": "result " + "w" * 50})
+    out = br._digest_history(msgs, tail=2)
+    # The verbatim tail (after the digest) must not begin on a bare tool message.
+    assert out[1]["role"] != "tool"
+
+
+def test_digest_records_tool_names_in_arc():
+    old = [
+        _msg("user", "do the thing"),
+        _msg("assistant", "", tool_calls=[
+            {"function": {"name": "skill_view", "arguments": "{}"}},
+            {"function": {"name": "patch", "arguments": "{}"}}]),
+    ]
+    msgs = old + [_msg("user", f"tail{i}") for i in range(30)]
+    out = br._digest_history(msgs, tail=10)
+    digest = out[0]["content"]
+    assert "USER: do the thing" in digest
+    assert "tools: skill_view, patch" in digest
--- a/website/docs/user-guide/features/memory.md
+++ b/website/docs/user-guide/features/memory.md
@ -270,6 +270,31 @@ display:
 > writes to your memory/skill stores, are unaffected by this setting. Set it
 > per-platform via `display.platforms.<platform>.memory_notifications`.

+## Running the review on a cheaper model (`auxiliary.background_review`)
+
+The review runs on your **main chat model** by default, replaying the
+conversation — which is already warm in the prompt cache, so it's cheap cache
+reads. On an expensive main model you can run the review on a cheaper model
+instead:
+
+```yaml
+auxiliary:
+  background_review:
+    provider: openrouter
+    model: google/gemini-3-flash-preview   # auto (default) = main chat model
+```
+
+When you point it at a model **different** from your main one, the review runs
+there for substantially lower cost (~3–5× in benchmarks). Because a different
+model can't reuse your main model's prompt cache anyway, the fork automatically
+replays a compact **digest** of the conversation (recent turns verbatim + a
+summary of older ones) rather than the full transcript — minimizing what it
+writes to the new cache. Capture holds: in testing, memory capture was
+identical and skill capture near-identical to the main-model review.
+
+Leave it at `auto` (or set it to your main model) and nothing changes — the
+review keeps running on the main model with the full warm-cache replay.
+
 ## Controlling skill writes (`skills.write_approval`)

 Skills use the same on/off gate, but the review UX differs because a