feat(background-review): aux-model selector for the self-improvement review (#49252)

Adds auxiliary.background_review.{provider,model} (default auto = main chat
model — unchanged). Set it to a different, cheaper model and the post-turn
self-improvement review runs there for ~3-5x lower cost.

Cache-aware by design: the main chat is warm in the prompt cache, so the
default full-history replay on the main model is cheap cache reads — left
exactly as-is. A different model can't reuse that cache (different key), so
when (and only when) routed to a different model the fork replays a compact
digest instead of the full transcript, minimising what it cold-writes on the
aux model. Same model -> full replay; different model -> digest.

Quality holds in benchmarks: memory capture identical, skill near-identical.
Nothing changes unless you opt in by naming a different model.

Co-authored-by: Hermes Agent <noreply@nousresearch.com>
This commit is contained in:
Teknium 2026-06-22 14:54:53 -07:00 committed by GitHub
parent 660e36f097
commit 87c4a5ebb8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 341 additions and 27 deletions

View file

@ -27,6 +27,131 @@ from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Background-review aux-model selector + routed digest.
#
# The review fork runs on the MAIN model by default ("auto"), replaying the
# full conversation — already warm in the prompt cache, so cheap cache reads.
# Optimal and unchanged. A user can route the review to a different, cheaper
# model via auxiliary.background_review.{provider,model}. A different model
# cannot reuse the parent's cache (different key), so the fork is cold
# regardless — replaying the full transcript would just cold-write it. So when
# (and only when) routed to a different model, we replay a compact DIGEST to
# minimise cold-written tokens. Same model -> full replay; different model ->
# digest. That's the whole policy.
# ---------------------------------------------------------------------------
def _resolve_review_runtime(agent: Any) -> Dict[str, Any]:
"""Resolve provider/model/credentials for the review fork.
Default (auto / unset / same as parent): inherit the parent's live runtime
(with codex_app_server -> codex_responses downgrade). ``routed`` is False
the fork uses the main model and the warm cache, exactly as before. When
``auxiliary.background_review.{provider,model}`` names a concrete model
different from the parent's, resolve that runtime and set ``routed=True``.
"""
parent_runtime = agent._current_main_runtime()
parent_api_mode = parent_runtime.get("api_mode") or None
if parent_api_mode == "codex_app_server":
parent_api_mode = "codex_responses"
parent = {
"provider": agent.provider,
"model": agent.model,
"api_key": parent_runtime.get("api_key") or None,
"base_url": parent_runtime.get("base_url") or None,
"api_mode": parent_api_mode,
"routed": False,
}
try:
from hermes_cli.config import load_config
cfg = load_config()
except Exception:
return parent
aux = cfg.get("auxiliary", {}) if isinstance(cfg.get("auxiliary"), dict) else {}
task = aux.get("background_review", {}) if isinstance(aux.get("background_review"), dict) else {}
task_provider = (str(task.get("provider", "")).strip() or None)
task_model = (str(task.get("model", "")).strip() or None)
task_base_url = (str(task.get("base_url", "")).strip() or None)
task_api_key = (str(task.get("api_key", "")).strip() or None)
if not (task_provider and task_provider != "auto" and task_model):
return parent
if task_provider == (agent.provider or "") and task_model == (agent.model or ""):
return parent # same model/provider as parent -> not routed
try:
from hermes_cli.runtime_provider import resolve_runtime_provider
rp = resolve_runtime_provider(
requested=task_provider,
target_model=task_model,
explicit_api_key=task_api_key,
explicit_base_url=task_base_url,
)
return {
"provider": rp.get("provider") or task_provider,
"model": task_model,
"api_key": rp.get("api_key"),
"base_url": rp.get("base_url"),
"api_mode": rp.get("api_mode"),
"routed": True,
}
except Exception as e:
logger.debug("background-review aux routing failed (%s); using main model", e)
return parent
def _msg_text(m: Dict) -> str:
c = m.get("content")
if isinstance(c, str):
return c.strip()
if isinstance(c, list):
return " ".join(b.get("text", "") for b in c if isinstance(b, dict)).strip()
return ""
def _digest_history(messages_snapshot: List[Dict], tail: int = 24) -> List[Dict]:
"""Compact replay for the routed (different-model) path only.
Keeps the recent ``tail`` messages verbatim, collapses older turns into one
synthetic user-role digest, preserving role alternation. Used ONLY when
routed to a different model (cache cold regardless, so fewer cold-written
tokens is a pure win). Never on the main-model path (full replay stays warm).
"""
msgs = list(messages_snapshot or [])
if len(msgs) <= tail:
return msgs
keep = msgs[-tail:]
while keep and isinstance(keep[0], dict) and keep[0].get("role") == "tool":
tail += 1
if len(msgs) <= tail:
return msgs
keep = msgs[-tail:]
old = msgs[:-len(keep)]
lines: List[str] = []
for m in old:
if not isinstance(m, dict):
continue
role = m.get("role")
text = _msg_text(m).replace("\n", " ")
if role == "user" and text:
lines.append(f"USER: {text[:300]}")
elif role == "assistant":
tcs = m.get("tool_calls") or []
if tcs:
names = [(tc.get("function") or {}).get("name", "?") for tc in tcs if isinstance(tc, dict)]
lines.append(f"ASSISTANT[tools: {', '.join(names)}]")
if text:
lines.append(f"ASSISTANT: {text[:200]}")
digest = {
"role": "user",
"content": (
"[Earlier conversation digest — older turns summarised to bound the "
"review's cold-write cost on the routed aux model. Recent turns "
"follow verbatim below.]\n" + "\n".join(lines)
),
}
return [digest] + keep
# Review-prompt strings — used by ``spawn_background_review_thread`` to build
# the user-message that the forked review agent receives. AIAgent exposes
# them as class attributes (``_MEMORY_REVIEW_PROMPT`` etc.) for back-compat;
@ -488,18 +613,13 @@ def _run_review_in_thread(
# creds, or credential-pool setups where the resolver can't
# reconstruct auth from scratch -- producing the spurious
# "No LLM provider configured" warning at end of turn.
_parent_runtime = agent._current_main_runtime()
_parent_api_mode = _parent_runtime.get("api_mode") or None
# The review fork needs to call agent-loop tools (memory,
# skill_manage). Those tools require Hermes' own dispatch,
# which the codex_app_server runtime bypasses entirely
# (it runs the turn inside codex's subprocess). So when
# the parent is on codex_app_server, downgrade the review
# fork to codex_responses — same auth/credentials, but
# talks to the OpenAI Responses API directly so Hermes
# owns the loop and the agent-loop tools dispatch.
if _parent_api_mode == "codex_app_server":
_parent_api_mode = "codex_responses"
# _resolve_review_runtime() returns the parent's live runtime by
# default (routed=False; main model, warm cache), or — when the user
# set auxiliary.background_review.{provider,model} to a different
# model — that model's runtime (routed=True). The codex_app_server
# -> codex_responses downgrade is applied inside the resolver.
_rt = _resolve_review_runtime(agent)
_routed = bool(_rt.get("routed"))
# skip_memory=True keeps the review fork from
# touching external memory plugins (honcho, mem0,
# supermemory, etc.). Without it, the fork's
@ -519,14 +639,14 @@ def _run_review_in_thread(
# in the request body — Anthropic's cache key includes it.
# (The runtime whitelist below still restricts dispatch.)
review_agent = AIAgent(
model=agent.model,
model=_rt.get("model") or agent.model,
max_iterations=16,
quiet_mode=True,
platform=agent.platform,
provider=agent.provider,
api_mode=_parent_api_mode,
base_url=_parent_runtime.get("base_url") or None,
api_key=_parent_runtime.get("api_key") or None,
provider=_rt.get("provider") or agent.provider,
api_mode=_rt.get("api_mode"),
base_url=_rt.get("base_url") or None,
api_key=_rt.get("api_key") or None,
credential_pool=getattr(agent, "_credential_pool", None),
parent_session_id=agent.session_id,
enabled_toolsets=getattr(agent, "enabled_toolsets", None),
@ -565,15 +685,20 @@ def _run_review_in_thread(
# issue #25322 and PR #17276 for the full analysis +
# measured impact (~26% end-to-end cost reduction on
# Sonnet 4.5).
review_agent._cached_system_prompt = agent._cached_system_prompt
# Defensive: pin session_start + session_id to the
# parent's so any code path that re-renders parts of
# the system prompt (compression, plugin hooks) still
# produces byte-identical output. The cached-prompt
# assignment above already short-circuits the normal
# rebuild path, but these pins guarantee parity even
# if a future code path bypasses the cache.
review_agent.session_start = agent.session_start
# Share the parent's warm cached system prompt ONLY when the review
# runs on the SAME model (not routed). When routed to a different
# model the parent's cached prompt is for the wrong model/cache key
# and would miss anyway, so let the routed fork build its own.
if not _routed:
review_agent._cached_system_prompt = agent._cached_system_prompt
# Defensive: pin session_start + session_id to the
# parent's so any code path that re-renders parts of
# the system prompt (compression, plugin hooks) still
# produces byte-identical output. The cached-prompt
# assignment above already short-circuits the normal
# rebuild path, but these pins guarantee parity even
# if a future code path bypasses the cache.
review_agent.session_start = agent.session_start
review_agent.session_id = agent.session_id
# The fork shares the parent's live session_id (pinned above for
# prefix-cache parity). It is single-lifecycle and calls close()
@ -615,6 +740,13 @@ def _run_review_in_thread(
),
)
try:
# Routed to a different model -> replay a digest (cache is cold
# on that model anyway, so minimise cold-written tokens). Same
# model -> replay the full snapshot (warm cache reads).
_review_history = (
_digest_history(messages_snapshot) if _routed
else messages_snapshot
)
review_agent.run_conversation(
user_message=(
prompt
@ -622,7 +754,7 @@ def _run_review_in_thread(
"management tools. Other tools will be denied "
"at runtime — do not attempt them."
),
conversation_history=messages_snapshot,
conversation_history=_review_history,
)
finally:
clear_thread_tool_whitelist()

View file

@ -1535,6 +1535,25 @@ DEFAULT_CONFIG = {
"timeout": 60,
"extra_body": {},
},
# Background review — the post-turn self-improvement fork that decides
# whether to save a memory / patch a skill. "auto" (default) = run on
# the main chat model, replaying the full conversation, which is already
# warm in the prompt cache (cheap cache reads) — unchanged, optimal.
# Set provider/model to a cheaper model (e.g. openrouter
# google/gemini-3-flash-preview) to run the review there for ~3-5x lower
# cost. A different model can't reuse the main prompt cache anyway, so
# the fork automatically replays a compact digest instead of the full
# transcript when routed (minimises the cold-write). Same model = full
# replay; different model = digest. Quality holds (memory capture
# identical, skill near-identical in benchmarks).
"background_review": {
"provider": "auto",
"model": "",
"base_url": "",
"api_key": "",
"timeout": 120,
"extra_body": {},
},
},
"display": {

View file

@ -0,0 +1,138 @@
"""Unit coverage for the background-review aux-model selector + routed digest.
Covers the two behaviors this change adds:
_resolve_review_runtime auto/same-model not routed (main model, warm
cache); a configured different model routed with resolved credentials.
_digest_history compact replay used ONLY on the routed path (recent tail
verbatim + a digest of older turns), preserving role alternation.
Pure-function / config-driven; no live model calls.
"""
from unittest.mock import patch
from agent import background_review as br
def _msg(role, content, tool_calls=None):
m = {"role": role, "content": content}
if tool_calls:
m["tool_calls"] = tool_calls
return m
# ---------------------------------------------------------------------------
# _resolve_review_runtime — the aux-model selector
# ---------------------------------------------------------------------------
class _FakeAgent:
def __init__(self, provider="openai-codex", model="gpt-5.5"):
self.provider = provider
self.model = model
def _current_main_runtime(self):
return {
"api_key": "parent-key",
"base_url": "https://chatgpt.com/backend-api/codex",
"api_mode": "codex_app_server",
}
def test_routing_auto_inherits_parent_and_downgrades_codex_app_server():
agent = _FakeAgent()
cfg = {"auxiliary": {"background_review": {"provider": "auto", "model": ""}}}
with patch("hermes_cli.config.load_config", return_value=cfg):
rt = br._resolve_review_runtime(agent)
assert rt["routed"] is False
assert rt["provider"] == "openai-codex"
assert rt["model"] == "gpt-5.5"
assert rt["api_mode"] == "codex_responses" # downgraded so agent-loop tools dispatch
def test_routing_to_different_model_marks_routed_and_resolves_credentials():
agent = _FakeAgent()
cfg = {"auxiliary": {"background_review": {
"provider": "openrouter", "model": "google/gemini-3-flash-preview",
}}}
fake_rp = {
"provider": "openrouter", "api_key": "or-key",
"base_url": "https://openrouter.ai/api/v1", "api_mode": "chat_completions",
}
with patch("hermes_cli.config.load_config", return_value=cfg), \
patch("hermes_cli.runtime_provider.resolve_runtime_provider", return_value=fake_rp):
rt = br._resolve_review_runtime(agent)
assert rt["routed"] is True
assert rt["provider"] == "openrouter"
assert rt["model"] == "google/gemini-3-flash-preview"
assert rt["api_key"] == "or-key"
def test_routing_same_model_as_parent_is_not_routed():
agent = _FakeAgent(provider="openrouter", model="anthropic/claude-opus-4.8")
cfg = {"auxiliary": {"background_review": {
"provider": "openrouter", "model": "anthropic/claude-opus-4.8",
}}}
with patch("hermes_cli.config.load_config", return_value=cfg):
rt = br._resolve_review_runtime(agent)
assert rt["routed"] is False # same model/provider → keep full-replay path
def test_routing_resolution_failure_falls_back_to_parent():
agent = _FakeAgent()
cfg = {"auxiliary": {"background_review": {
"provider": "openrouter", "model": "google/gemini-3-flash-preview",
}}}
with patch("hermes_cli.config.load_config", return_value=cfg), \
patch("hermes_cli.runtime_provider.resolve_runtime_provider",
side_effect=RuntimeError("boom")):
rt = br._resolve_review_runtime(agent)
assert rt["routed"] is False
assert rt["provider"] == "openai-codex"
# ---------------------------------------------------------------------------
# _digest_history — routed-path compact replay
# ---------------------------------------------------------------------------
def test_digest_under_tail_returns_full():
msgs = [_msg("user", "hi"), _msg("assistant", "hello")]
assert br._digest_history(msgs, tail=24) == msgs
def test_digest_collapses_old_keeps_tail_verbatim():
msgs = []
for i in range(60):
msgs.append(_msg("user", f"u{i} " + "x" * 50))
msgs.append(_msg("assistant", f"a{i} " + "y" * 50))
out = br._digest_history(msgs, tail=10)
# First message is the synthetic digest (user role → alternation preserved).
assert out[0]["role"] == "user"
assert out[0]["content"].startswith("[Earlier conversation digest")
# Recent tail preserved verbatim.
assert out[-1] == msgs[-1]
assert len(out) == 11 # 1 digest + 10 tail
def test_digest_does_not_open_tail_on_a_tool_message():
msgs = []
for i in range(40):
msgs.append(_msg("user", "u" + "x" * 50))
msgs.append(_msg("assistant", "", tool_calls=[
{"function": {"name": "terminal", "arguments": "{}"}}]))
msgs.append({"role": "tool", "content": "result " + "w" * 50})
out = br._digest_history(msgs, tail=2)
# The verbatim tail (after the digest) must not begin on a bare tool message.
assert out[1]["role"] != "tool"
def test_digest_records_tool_names_in_arc():
old = [
_msg("user", "do the thing"),
_msg("assistant", "", tool_calls=[
{"function": {"name": "skill_view", "arguments": "{}"}},
{"function": {"name": "patch", "arguments": "{}"}}]),
]
msgs = old + [_msg("user", f"tail{i}") for i in range(30)]
out = br._digest_history(msgs, tail=10)
digest = out[0]["content"]
assert "USER: do the thing" in digest
assert "tools: skill_view, patch" in digest

View file

@ -270,6 +270,31 @@ display:
> writes to your memory/skill stores, are unaffected by this setting. Set it
> per-platform via `display.platforms.<platform>.memory_notifications`.
## Running the review on a cheaper model (`auxiliary.background_review`)
The review runs on your **main chat model** by default, replaying the
conversation — which is already warm in the prompt cache, so it's cheap cache
reads. On an expensive main model you can run the review on a cheaper model
instead:
```yaml
auxiliary:
background_review:
provider: openrouter
model: google/gemini-3-flash-preview # auto (default) = main chat model
```
When you point it at a model **different** from your main one, the review runs
there for substantially lower cost (~35× in benchmarks). Because a different
model can't reuse your main model's prompt cache anyway, the fork automatically
replays a compact **digest** of the conversation (recent turns verbatim + a
summary of older ones) rather than the full transcript — minimizing what it
writes to the new cache. Capture holds: in testing, memory capture was
identical and skill capture near-identical to the main-model review.
Leave it at `auto` (or set it to your main model) and nothing changes — the
review keeps running on the main model with the full warm-cache replay.
## Controlling skill writes (`skills.write_approval`)
Skills use the same on/off gate, but the review UX differs because a