diff --git a/agent/background_review.py b/agent/background_review.py index fa4de508e19..564c5441996 100644 --- a/agent/background_review.py +++ b/agent/background_review.py @@ -27,6 +27,131 @@ from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# Background-review aux-model selector + routed digest. +# +# The review fork runs on the MAIN model by default ("auto"), replaying the +# full conversation — already warm in the prompt cache, so cheap cache reads. +# Optimal and unchanged. A user can route the review to a different, cheaper +# model via auxiliary.background_review.{provider,model}. A different model +# cannot reuse the parent's cache (different key), so the fork is cold +# regardless — replaying the full transcript would just cold-write it. So when +# (and only when) routed to a different model, we replay a compact DIGEST to +# minimise cold-written tokens. Same model -> full replay; different model -> +# digest. That's the whole policy. +# --------------------------------------------------------------------------- + + +def _resolve_review_runtime(agent: Any) -> Dict[str, Any]: + """Resolve provider/model/credentials for the review fork. + + Default (auto / unset / same as parent): inherit the parent's live runtime + (with codex_app_server -> codex_responses downgrade). ``routed`` is False — + the fork uses the main model and the warm cache, exactly as before. When + ``auxiliary.background_review.{provider,model}`` names a concrete model + different from the parent's, resolve that runtime and set ``routed=True``. + """ + parent_runtime = agent._current_main_runtime() + parent_api_mode = parent_runtime.get("api_mode") or None + if parent_api_mode == "codex_app_server": + parent_api_mode = "codex_responses" + parent = { + "provider": agent.provider, + "model": agent.model, + "api_key": parent_runtime.get("api_key") or None, + "base_url": parent_runtime.get("base_url") or None, + "api_mode": parent_api_mode, + "routed": False, + } + try: + from hermes_cli.config import load_config + cfg = load_config() + except Exception: + return parent + aux = cfg.get("auxiliary", {}) if isinstance(cfg.get("auxiliary"), dict) else {} + task = aux.get("background_review", {}) if isinstance(aux.get("background_review"), dict) else {} + task_provider = (str(task.get("provider", "")).strip() or None) + task_model = (str(task.get("model", "")).strip() or None) + task_base_url = (str(task.get("base_url", "")).strip() or None) + task_api_key = (str(task.get("api_key", "")).strip() or None) + if not (task_provider and task_provider != "auto" and task_model): + return parent + if task_provider == (agent.provider or "") and task_model == (agent.model or ""): + return parent # same model/provider as parent -> not routed + try: + from hermes_cli.runtime_provider import resolve_runtime_provider + rp = resolve_runtime_provider( + requested=task_provider, + target_model=task_model, + explicit_api_key=task_api_key, + explicit_base_url=task_base_url, + ) + return { + "provider": rp.get("provider") or task_provider, + "model": task_model, + "api_key": rp.get("api_key"), + "base_url": rp.get("base_url"), + "api_mode": rp.get("api_mode"), + "routed": True, + } + except Exception as e: + logger.debug("background-review aux routing failed (%s); using main model", e) + return parent + + +def _msg_text(m: Dict) -> str: + c = m.get("content") + if isinstance(c, str): + return c.strip() + if isinstance(c, list): + return " ".join(b.get("text", "") for b in c if isinstance(b, dict)).strip() + return "" + + +def _digest_history(messages_snapshot: List[Dict], tail: int = 24) -> List[Dict]: + """Compact replay for the routed (different-model) path only. + + Keeps the recent ``tail`` messages verbatim, collapses older turns into one + synthetic user-role digest, preserving role alternation. Used ONLY when + routed to a different model (cache cold regardless, so fewer cold-written + tokens is a pure win). Never on the main-model path (full replay stays warm). + """ + msgs = list(messages_snapshot or []) + if len(msgs) <= tail: + return msgs + keep = msgs[-tail:] + while keep and isinstance(keep[0], dict) and keep[0].get("role") == "tool": + tail += 1 + if len(msgs) <= tail: + return msgs + keep = msgs[-tail:] + old = msgs[:-len(keep)] + lines: List[str] = [] + for m in old: + if not isinstance(m, dict): + continue + role = m.get("role") + text = _msg_text(m).replace("\n", " ") + if role == "user" and text: + lines.append(f"USER: {text[:300]}") + elif role == "assistant": + tcs = m.get("tool_calls") or [] + if tcs: + names = [(tc.get("function") or {}).get("name", "?") for tc in tcs if isinstance(tc, dict)] + lines.append(f"ASSISTANT[tools: {', '.join(names)}]") + if text: + lines.append(f"ASSISTANT: {text[:200]}") + digest = { + "role": "user", + "content": ( + "[Earlier conversation digest — older turns summarised to bound the " + "review's cold-write cost on the routed aux model. Recent turns " + "follow verbatim below.]\n" + "\n".join(lines) + ), + } + return [digest] + keep + + # Review-prompt strings — used by ``spawn_background_review_thread`` to build # the user-message that the forked review agent receives. AIAgent exposes # them as class attributes (``_MEMORY_REVIEW_PROMPT`` etc.) for back-compat; @@ -488,18 +613,13 @@ def _run_review_in_thread( # creds, or credential-pool setups where the resolver can't # reconstruct auth from scratch -- producing the spurious # "No LLM provider configured" warning at end of turn. - _parent_runtime = agent._current_main_runtime() - _parent_api_mode = _parent_runtime.get("api_mode") or None - # The review fork needs to call agent-loop tools (memory, - # skill_manage). Those tools require Hermes' own dispatch, - # which the codex_app_server runtime bypasses entirely - # (it runs the turn inside codex's subprocess). So when - # the parent is on codex_app_server, downgrade the review - # fork to codex_responses — same auth/credentials, but - # talks to the OpenAI Responses API directly so Hermes - # owns the loop and the agent-loop tools dispatch. - if _parent_api_mode == "codex_app_server": - _parent_api_mode = "codex_responses" + # _resolve_review_runtime() returns the parent's live runtime by + # default (routed=False; main model, warm cache), or — when the user + # set auxiliary.background_review.{provider,model} to a different + # model — that model's runtime (routed=True). The codex_app_server + # -> codex_responses downgrade is applied inside the resolver. + _rt = _resolve_review_runtime(agent) + _routed = bool(_rt.get("routed")) # skip_memory=True keeps the review fork from # touching external memory plugins (honcho, mem0, # supermemory, etc.). Without it, the fork's @@ -519,14 +639,14 @@ def _run_review_in_thread( # in the request body — Anthropic's cache key includes it. # (The runtime whitelist below still restricts dispatch.) review_agent = AIAgent( - model=agent.model, + model=_rt.get("model") or agent.model, max_iterations=16, quiet_mode=True, platform=agent.platform, - provider=agent.provider, - api_mode=_parent_api_mode, - base_url=_parent_runtime.get("base_url") or None, - api_key=_parent_runtime.get("api_key") or None, + provider=_rt.get("provider") or agent.provider, + api_mode=_rt.get("api_mode"), + base_url=_rt.get("base_url") or None, + api_key=_rt.get("api_key") or None, credential_pool=getattr(agent, "_credential_pool", None), parent_session_id=agent.session_id, enabled_toolsets=getattr(agent, "enabled_toolsets", None), @@ -565,15 +685,20 @@ def _run_review_in_thread( # issue #25322 and PR #17276 for the full analysis + # measured impact (~26% end-to-end cost reduction on # Sonnet 4.5). - review_agent._cached_system_prompt = agent._cached_system_prompt - # Defensive: pin session_start + session_id to the - # parent's so any code path that re-renders parts of - # the system prompt (compression, plugin hooks) still - # produces byte-identical output. The cached-prompt - # assignment above already short-circuits the normal - # rebuild path, but these pins guarantee parity even - # if a future code path bypasses the cache. - review_agent.session_start = agent.session_start + # Share the parent's warm cached system prompt ONLY when the review + # runs on the SAME model (not routed). When routed to a different + # model the parent's cached prompt is for the wrong model/cache key + # and would miss anyway, so let the routed fork build its own. + if not _routed: + review_agent._cached_system_prompt = agent._cached_system_prompt + # Defensive: pin session_start + session_id to the + # parent's so any code path that re-renders parts of + # the system prompt (compression, plugin hooks) still + # produces byte-identical output. The cached-prompt + # assignment above already short-circuits the normal + # rebuild path, but these pins guarantee parity even + # if a future code path bypasses the cache. + review_agent.session_start = agent.session_start review_agent.session_id = agent.session_id # The fork shares the parent's live session_id (pinned above for # prefix-cache parity). It is single-lifecycle and calls close() @@ -615,6 +740,13 @@ def _run_review_in_thread( ), ) try: + # Routed to a different model -> replay a digest (cache is cold + # on that model anyway, so minimise cold-written tokens). Same + # model -> replay the full snapshot (warm cache reads). + _review_history = ( + _digest_history(messages_snapshot) if _routed + else messages_snapshot + ) review_agent.run_conversation( user_message=( prompt @@ -622,7 +754,7 @@ def _run_review_in_thread( "management tools. Other tools will be denied " "at runtime — do not attempt them." ), - conversation_history=messages_snapshot, + conversation_history=_review_history, ) finally: clear_thread_tool_whitelist() diff --git a/hermes_cli/config.py b/hermes_cli/config.py index ce8ec7d6693..34923375984 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1535,6 +1535,25 @@ DEFAULT_CONFIG = { "timeout": 60, "extra_body": {}, }, + # Background review — the post-turn self-improvement fork that decides + # whether to save a memory / patch a skill. "auto" (default) = run on + # the main chat model, replaying the full conversation, which is already + # warm in the prompt cache (cheap cache reads) — unchanged, optimal. + # Set provider/model to a cheaper model (e.g. openrouter + # google/gemini-3-flash-preview) to run the review there for ~3-5x lower + # cost. A different model can't reuse the main prompt cache anyway, so + # the fork automatically replays a compact digest instead of the full + # transcript when routed (minimises the cold-write). Same model = full + # replay; different model = digest. Quality holds (memory capture + # identical, skill near-identical in benchmarks). + "background_review": { + "provider": "auto", + "model": "", + "base_url": "", + "api_key": "", + "timeout": 120, + "extra_body": {}, + }, }, "display": { diff --git a/tests/run_agent/test_background_review_cost_controls.py b/tests/run_agent/test_background_review_cost_controls.py new file mode 100644 index 00000000000..5ca47b2a0f9 --- /dev/null +++ b/tests/run_agent/test_background_review_cost_controls.py @@ -0,0 +1,138 @@ +"""Unit coverage for the background-review aux-model selector + routed digest. + +Covers the two behaviors this change adds: + • _resolve_review_runtime — auto/same-model → not routed (main model, warm + cache); a configured different model → routed with resolved credentials. + • _digest_history — compact replay used ONLY on the routed path (recent tail + verbatim + a digest of older turns), preserving role alternation. + +Pure-function / config-driven; no live model calls. +""" +from unittest.mock import patch + +from agent import background_review as br + + +def _msg(role, content, tool_calls=None): + m = {"role": role, "content": content} + if tool_calls: + m["tool_calls"] = tool_calls + return m + + +# --------------------------------------------------------------------------- +# _resolve_review_runtime — the aux-model selector +# --------------------------------------------------------------------------- + +class _FakeAgent: + def __init__(self, provider="openai-codex", model="gpt-5.5"): + self.provider = provider + self.model = model + + def _current_main_runtime(self): + return { + "api_key": "parent-key", + "base_url": "https://chatgpt.com/backend-api/codex", + "api_mode": "codex_app_server", + } + + +def test_routing_auto_inherits_parent_and_downgrades_codex_app_server(): + agent = _FakeAgent() + cfg = {"auxiliary": {"background_review": {"provider": "auto", "model": ""}}} + with patch("hermes_cli.config.load_config", return_value=cfg): + rt = br._resolve_review_runtime(agent) + assert rt["routed"] is False + assert rt["provider"] == "openai-codex" + assert rt["model"] == "gpt-5.5" + assert rt["api_mode"] == "codex_responses" # downgraded so agent-loop tools dispatch + + +def test_routing_to_different_model_marks_routed_and_resolves_credentials(): + agent = _FakeAgent() + cfg = {"auxiliary": {"background_review": { + "provider": "openrouter", "model": "google/gemini-3-flash-preview", + }}} + fake_rp = { + "provider": "openrouter", "api_key": "or-key", + "base_url": "https://openrouter.ai/api/v1", "api_mode": "chat_completions", + } + with patch("hermes_cli.config.load_config", return_value=cfg), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", return_value=fake_rp): + rt = br._resolve_review_runtime(agent) + assert rt["routed"] is True + assert rt["provider"] == "openrouter" + assert rt["model"] == "google/gemini-3-flash-preview" + assert rt["api_key"] == "or-key" + + +def test_routing_same_model_as_parent_is_not_routed(): + agent = _FakeAgent(provider="openrouter", model="anthropic/claude-opus-4.8") + cfg = {"auxiliary": {"background_review": { + "provider": "openrouter", "model": "anthropic/claude-opus-4.8", + }}} + with patch("hermes_cli.config.load_config", return_value=cfg): + rt = br._resolve_review_runtime(agent) + assert rt["routed"] is False # same model/provider → keep full-replay path + + +def test_routing_resolution_failure_falls_back_to_parent(): + agent = _FakeAgent() + cfg = {"auxiliary": {"background_review": { + "provider": "openrouter", "model": "google/gemini-3-flash-preview", + }}} + with patch("hermes_cli.config.load_config", return_value=cfg), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + side_effect=RuntimeError("boom")): + rt = br._resolve_review_runtime(agent) + assert rt["routed"] is False + assert rt["provider"] == "openai-codex" + + +# --------------------------------------------------------------------------- +# _digest_history — routed-path compact replay +# --------------------------------------------------------------------------- + +def test_digest_under_tail_returns_full(): + msgs = [_msg("user", "hi"), _msg("assistant", "hello")] + assert br._digest_history(msgs, tail=24) == msgs + + +def test_digest_collapses_old_keeps_tail_verbatim(): + msgs = [] + for i in range(60): + msgs.append(_msg("user", f"u{i} " + "x" * 50)) + msgs.append(_msg("assistant", f"a{i} " + "y" * 50)) + out = br._digest_history(msgs, tail=10) + # First message is the synthetic digest (user role → alternation preserved). + assert out[0]["role"] == "user" + assert out[0]["content"].startswith("[Earlier conversation digest") + # Recent tail preserved verbatim. + assert out[-1] == msgs[-1] + assert len(out) == 11 # 1 digest + 10 tail + + +def test_digest_does_not_open_tail_on_a_tool_message(): + msgs = [] + for i in range(40): + msgs.append(_msg("user", "u" + "x" * 50)) + msgs.append(_msg("assistant", "", tool_calls=[ + {"function": {"name": "terminal", "arguments": "{}"}}])) + msgs.append({"role": "tool", "content": "result " + "w" * 50}) + out = br._digest_history(msgs, tail=2) + # The verbatim tail (after the digest) must not begin on a bare tool message. + assert out[1]["role"] != "tool" + + +def test_digest_records_tool_names_in_arc(): + old = [ + _msg("user", "do the thing"), + _msg("assistant", "", tool_calls=[ + {"function": {"name": "skill_view", "arguments": "{}"}}, + {"function": {"name": "patch", "arguments": "{}"}}]), + ] + msgs = old + [_msg("user", f"tail{i}") for i in range(30)] + out = br._digest_history(msgs, tail=10) + digest = out[0]["content"] + assert "USER: do the thing" in digest + assert "tools: skill_view, patch" in digest diff --git a/website/docs/user-guide/features/memory.md b/website/docs/user-guide/features/memory.md index 41efc92285c..20c37afa12f 100644 --- a/website/docs/user-guide/features/memory.md +++ b/website/docs/user-guide/features/memory.md @@ -270,6 +270,31 @@ display: > writes to your memory/skill stores, are unaffected by this setting. Set it > per-platform via `display.platforms..memory_notifications`. +## Running the review on a cheaper model (`auxiliary.background_review`) + +The review runs on your **main chat model** by default, replaying the +conversation — which is already warm in the prompt cache, so it's cheap cache +reads. On an expensive main model you can run the review on a cheaper model +instead: + +```yaml +auxiliary: + background_review: + provider: openrouter + model: google/gemini-3-flash-preview # auto (default) = main chat model +``` + +When you point it at a model **different** from your main one, the review runs +there for substantially lower cost (~3–5× in benchmarks). Because a different +model can't reuse your main model's prompt cache anyway, the fork automatically +replays a compact **digest** of the conversation (recent turns verbatim + a +summary of older ones) rather than the full transcript — minimizing what it +writes to the new cache. Capture holds: in testing, memory capture was +identical and skill capture near-identical to the main-model review. + +Leave it at `auto` (or set it to your main model) and nothing changes — the +review keeps running on the main model with the full warm-cache replay. + ## Controlling skill writes (`skills.write_approval`) Skills use the same on/off gate, but the review UX differs because a