mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
feat(background-review): aux-model selector for the self-improvement review (#49252)
Adds auxiliary.background_review.{provider,model} (default auto = main chat
model — unchanged). Set it to a different, cheaper model and the post-turn
self-improvement review runs there for ~3-5x lower cost.
Cache-aware by design: the main chat is warm in the prompt cache, so the
default full-history replay on the main model is cheap cache reads — left
exactly as-is. A different model can't reuse that cache (different key), so
when (and only when) routed to a different model the fork replays a compact
digest instead of the full transcript, minimising what it cold-writes on the
aux model. Same model -> full replay; different model -> digest.
Quality holds in benchmarks: memory capture identical, skill near-identical.
Nothing changes unless you opt in by naming a different model.
Co-authored-by: Hermes Agent <noreply@nousresearch.com>
This commit is contained in:
parent
660e36f097
commit
87c4a5ebb8
4 changed files with 341 additions and 27 deletions
|
|
@ -27,6 +27,131 @@ from typing import Any, Dict, List, Optional
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Background-review aux-model selector + routed digest.
|
||||
#
|
||||
# The review fork runs on the MAIN model by default ("auto"), replaying the
|
||||
# full conversation — already warm in the prompt cache, so cheap cache reads.
|
||||
# Optimal and unchanged. A user can route the review to a different, cheaper
|
||||
# model via auxiliary.background_review.{provider,model}. A different model
|
||||
# cannot reuse the parent's cache (different key), so the fork is cold
|
||||
# regardless — replaying the full transcript would just cold-write it. So when
|
||||
# (and only when) routed to a different model, we replay a compact DIGEST to
|
||||
# minimise cold-written tokens. Same model -> full replay; different model ->
|
||||
# digest. That's the whole policy.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _resolve_review_runtime(agent: Any) -> Dict[str, Any]:
|
||||
"""Resolve provider/model/credentials for the review fork.
|
||||
|
||||
Default (auto / unset / same as parent): inherit the parent's live runtime
|
||||
(with codex_app_server -> codex_responses downgrade). ``routed`` is False —
|
||||
the fork uses the main model and the warm cache, exactly as before. When
|
||||
``auxiliary.background_review.{provider,model}`` names a concrete model
|
||||
different from the parent's, resolve that runtime and set ``routed=True``.
|
||||
"""
|
||||
parent_runtime = agent._current_main_runtime()
|
||||
parent_api_mode = parent_runtime.get("api_mode") or None
|
||||
if parent_api_mode == "codex_app_server":
|
||||
parent_api_mode = "codex_responses"
|
||||
parent = {
|
||||
"provider": agent.provider,
|
||||
"model": agent.model,
|
||||
"api_key": parent_runtime.get("api_key") or None,
|
||||
"base_url": parent_runtime.get("base_url") or None,
|
||||
"api_mode": parent_api_mode,
|
||||
"routed": False,
|
||||
}
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
cfg = load_config()
|
||||
except Exception:
|
||||
return parent
|
||||
aux = cfg.get("auxiliary", {}) if isinstance(cfg.get("auxiliary"), dict) else {}
|
||||
task = aux.get("background_review", {}) if isinstance(aux.get("background_review"), dict) else {}
|
||||
task_provider = (str(task.get("provider", "")).strip() or None)
|
||||
task_model = (str(task.get("model", "")).strip() or None)
|
||||
task_base_url = (str(task.get("base_url", "")).strip() or None)
|
||||
task_api_key = (str(task.get("api_key", "")).strip() or None)
|
||||
if not (task_provider and task_provider != "auto" and task_model):
|
||||
return parent
|
||||
if task_provider == (agent.provider or "") and task_model == (agent.model or ""):
|
||||
return parent # same model/provider as parent -> not routed
|
||||
try:
|
||||
from hermes_cli.runtime_provider import resolve_runtime_provider
|
||||
rp = resolve_runtime_provider(
|
||||
requested=task_provider,
|
||||
target_model=task_model,
|
||||
explicit_api_key=task_api_key,
|
||||
explicit_base_url=task_base_url,
|
||||
)
|
||||
return {
|
||||
"provider": rp.get("provider") or task_provider,
|
||||
"model": task_model,
|
||||
"api_key": rp.get("api_key"),
|
||||
"base_url": rp.get("base_url"),
|
||||
"api_mode": rp.get("api_mode"),
|
||||
"routed": True,
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug("background-review aux routing failed (%s); using main model", e)
|
||||
return parent
|
||||
|
||||
|
||||
def _msg_text(m: Dict) -> str:
|
||||
c = m.get("content")
|
||||
if isinstance(c, str):
|
||||
return c.strip()
|
||||
if isinstance(c, list):
|
||||
return " ".join(b.get("text", "") for b in c if isinstance(b, dict)).strip()
|
||||
return ""
|
||||
|
||||
|
||||
def _digest_history(messages_snapshot: List[Dict], tail: int = 24) -> List[Dict]:
|
||||
"""Compact replay for the routed (different-model) path only.
|
||||
|
||||
Keeps the recent ``tail`` messages verbatim, collapses older turns into one
|
||||
synthetic user-role digest, preserving role alternation. Used ONLY when
|
||||
routed to a different model (cache cold regardless, so fewer cold-written
|
||||
tokens is a pure win). Never on the main-model path (full replay stays warm).
|
||||
"""
|
||||
msgs = list(messages_snapshot or [])
|
||||
if len(msgs) <= tail:
|
||||
return msgs
|
||||
keep = msgs[-tail:]
|
||||
while keep and isinstance(keep[0], dict) and keep[0].get("role") == "tool":
|
||||
tail += 1
|
||||
if len(msgs) <= tail:
|
||||
return msgs
|
||||
keep = msgs[-tail:]
|
||||
old = msgs[:-len(keep)]
|
||||
lines: List[str] = []
|
||||
for m in old:
|
||||
if not isinstance(m, dict):
|
||||
continue
|
||||
role = m.get("role")
|
||||
text = _msg_text(m).replace("\n", " ")
|
||||
if role == "user" and text:
|
||||
lines.append(f"USER: {text[:300]}")
|
||||
elif role == "assistant":
|
||||
tcs = m.get("tool_calls") or []
|
||||
if tcs:
|
||||
names = [(tc.get("function") or {}).get("name", "?") for tc in tcs if isinstance(tc, dict)]
|
||||
lines.append(f"ASSISTANT[tools: {', '.join(names)}]")
|
||||
if text:
|
||||
lines.append(f"ASSISTANT: {text[:200]}")
|
||||
digest = {
|
||||
"role": "user",
|
||||
"content": (
|
||||
"[Earlier conversation digest — older turns summarised to bound the "
|
||||
"review's cold-write cost on the routed aux model. Recent turns "
|
||||
"follow verbatim below.]\n" + "\n".join(lines)
|
||||
),
|
||||
}
|
||||
return [digest] + keep
|
||||
|
||||
|
||||
# Review-prompt strings — used by ``spawn_background_review_thread`` to build
|
||||
# the user-message that the forked review agent receives. AIAgent exposes
|
||||
# them as class attributes (``_MEMORY_REVIEW_PROMPT`` etc.) for back-compat;
|
||||
|
|
@ -488,18 +613,13 @@ def _run_review_in_thread(
|
|||
# creds, or credential-pool setups where the resolver can't
|
||||
# reconstruct auth from scratch -- producing the spurious
|
||||
# "No LLM provider configured" warning at end of turn.
|
||||
_parent_runtime = agent._current_main_runtime()
|
||||
_parent_api_mode = _parent_runtime.get("api_mode") or None
|
||||
# The review fork needs to call agent-loop tools (memory,
|
||||
# skill_manage). Those tools require Hermes' own dispatch,
|
||||
# which the codex_app_server runtime bypasses entirely
|
||||
# (it runs the turn inside codex's subprocess). So when
|
||||
# the parent is on codex_app_server, downgrade the review
|
||||
# fork to codex_responses — same auth/credentials, but
|
||||
# talks to the OpenAI Responses API directly so Hermes
|
||||
# owns the loop and the agent-loop tools dispatch.
|
||||
if _parent_api_mode == "codex_app_server":
|
||||
_parent_api_mode = "codex_responses"
|
||||
# _resolve_review_runtime() returns the parent's live runtime by
|
||||
# default (routed=False; main model, warm cache), or — when the user
|
||||
# set auxiliary.background_review.{provider,model} to a different
|
||||
# model — that model's runtime (routed=True). The codex_app_server
|
||||
# -> codex_responses downgrade is applied inside the resolver.
|
||||
_rt = _resolve_review_runtime(agent)
|
||||
_routed = bool(_rt.get("routed"))
|
||||
# skip_memory=True keeps the review fork from
|
||||
# touching external memory plugins (honcho, mem0,
|
||||
# supermemory, etc.). Without it, the fork's
|
||||
|
|
@ -519,14 +639,14 @@ def _run_review_in_thread(
|
|||
# in the request body — Anthropic's cache key includes it.
|
||||
# (The runtime whitelist below still restricts dispatch.)
|
||||
review_agent = AIAgent(
|
||||
model=agent.model,
|
||||
model=_rt.get("model") or agent.model,
|
||||
max_iterations=16,
|
||||
quiet_mode=True,
|
||||
platform=agent.platform,
|
||||
provider=agent.provider,
|
||||
api_mode=_parent_api_mode,
|
||||
base_url=_parent_runtime.get("base_url") or None,
|
||||
api_key=_parent_runtime.get("api_key") or None,
|
||||
provider=_rt.get("provider") or agent.provider,
|
||||
api_mode=_rt.get("api_mode"),
|
||||
base_url=_rt.get("base_url") or None,
|
||||
api_key=_rt.get("api_key") or None,
|
||||
credential_pool=getattr(agent, "_credential_pool", None),
|
||||
parent_session_id=agent.session_id,
|
||||
enabled_toolsets=getattr(agent, "enabled_toolsets", None),
|
||||
|
|
@ -565,15 +685,20 @@ def _run_review_in_thread(
|
|||
# issue #25322 and PR #17276 for the full analysis +
|
||||
# measured impact (~26% end-to-end cost reduction on
|
||||
# Sonnet 4.5).
|
||||
review_agent._cached_system_prompt = agent._cached_system_prompt
|
||||
# Defensive: pin session_start + session_id to the
|
||||
# parent's so any code path that re-renders parts of
|
||||
# the system prompt (compression, plugin hooks) still
|
||||
# produces byte-identical output. The cached-prompt
|
||||
# assignment above already short-circuits the normal
|
||||
# rebuild path, but these pins guarantee parity even
|
||||
# if a future code path bypasses the cache.
|
||||
review_agent.session_start = agent.session_start
|
||||
# Share the parent's warm cached system prompt ONLY when the review
|
||||
# runs on the SAME model (not routed). When routed to a different
|
||||
# model the parent's cached prompt is for the wrong model/cache key
|
||||
# and would miss anyway, so let the routed fork build its own.
|
||||
if not _routed:
|
||||
review_agent._cached_system_prompt = agent._cached_system_prompt
|
||||
# Defensive: pin session_start + session_id to the
|
||||
# parent's so any code path that re-renders parts of
|
||||
# the system prompt (compression, plugin hooks) still
|
||||
# produces byte-identical output. The cached-prompt
|
||||
# assignment above already short-circuits the normal
|
||||
# rebuild path, but these pins guarantee parity even
|
||||
# if a future code path bypasses the cache.
|
||||
review_agent.session_start = agent.session_start
|
||||
review_agent.session_id = agent.session_id
|
||||
# The fork shares the parent's live session_id (pinned above for
|
||||
# prefix-cache parity). It is single-lifecycle and calls close()
|
||||
|
|
@ -615,6 +740,13 @@ def _run_review_in_thread(
|
|||
),
|
||||
)
|
||||
try:
|
||||
# Routed to a different model -> replay a digest (cache is cold
|
||||
# on that model anyway, so minimise cold-written tokens). Same
|
||||
# model -> replay the full snapshot (warm cache reads).
|
||||
_review_history = (
|
||||
_digest_history(messages_snapshot) if _routed
|
||||
else messages_snapshot
|
||||
)
|
||||
review_agent.run_conversation(
|
||||
user_message=(
|
||||
prompt
|
||||
|
|
@ -622,7 +754,7 @@ def _run_review_in_thread(
|
|||
"management tools. Other tools will be denied "
|
||||
"at runtime — do not attempt them."
|
||||
),
|
||||
conversation_history=messages_snapshot,
|
||||
conversation_history=_review_history,
|
||||
)
|
||||
finally:
|
||||
clear_thread_tool_whitelist()
|
||||
|
|
|
|||
|
|
@ -1535,6 +1535,25 @@ DEFAULT_CONFIG = {
|
|||
"timeout": 60,
|
||||
"extra_body": {},
|
||||
},
|
||||
# Background review — the post-turn self-improvement fork that decides
|
||||
# whether to save a memory / patch a skill. "auto" (default) = run on
|
||||
# the main chat model, replaying the full conversation, which is already
|
||||
# warm in the prompt cache (cheap cache reads) — unchanged, optimal.
|
||||
# Set provider/model to a cheaper model (e.g. openrouter
|
||||
# google/gemini-3-flash-preview) to run the review there for ~3-5x lower
|
||||
# cost. A different model can't reuse the main prompt cache anyway, so
|
||||
# the fork automatically replays a compact digest instead of the full
|
||||
# transcript when routed (minimises the cold-write). Same model = full
|
||||
# replay; different model = digest. Quality holds (memory capture
|
||||
# identical, skill near-identical in benchmarks).
|
||||
"background_review": {
|
||||
"provider": "auto",
|
||||
"model": "",
|
||||
"base_url": "",
|
||||
"api_key": "",
|
||||
"timeout": 120,
|
||||
"extra_body": {},
|
||||
},
|
||||
},
|
||||
|
||||
"display": {
|
||||
|
|
|
|||
138
tests/run_agent/test_background_review_cost_controls.py
Normal file
138
tests/run_agent/test_background_review_cost_controls.py
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
"""Unit coverage for the background-review aux-model selector + routed digest.
|
||||
|
||||
Covers the two behaviors this change adds:
|
||||
• _resolve_review_runtime — auto/same-model → not routed (main model, warm
|
||||
cache); a configured different model → routed with resolved credentials.
|
||||
• _digest_history — compact replay used ONLY on the routed path (recent tail
|
||||
verbatim + a digest of older turns), preserving role alternation.
|
||||
|
||||
Pure-function / config-driven; no live model calls.
|
||||
"""
|
||||
from unittest.mock import patch
|
||||
|
||||
from agent import background_review as br
|
||||
|
||||
|
||||
def _msg(role, content, tool_calls=None):
|
||||
m = {"role": role, "content": content}
|
||||
if tool_calls:
|
||||
m["tool_calls"] = tool_calls
|
||||
return m
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _resolve_review_runtime — the aux-model selector
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _FakeAgent:
|
||||
def __init__(self, provider="openai-codex", model="gpt-5.5"):
|
||||
self.provider = provider
|
||||
self.model = model
|
||||
|
||||
def _current_main_runtime(self):
|
||||
return {
|
||||
"api_key": "parent-key",
|
||||
"base_url": "https://chatgpt.com/backend-api/codex",
|
||||
"api_mode": "codex_app_server",
|
||||
}
|
||||
|
||||
|
||||
def test_routing_auto_inherits_parent_and_downgrades_codex_app_server():
|
||||
agent = _FakeAgent()
|
||||
cfg = {"auxiliary": {"background_review": {"provider": "auto", "model": ""}}}
|
||||
with patch("hermes_cli.config.load_config", return_value=cfg):
|
||||
rt = br._resolve_review_runtime(agent)
|
||||
assert rt["routed"] is False
|
||||
assert rt["provider"] == "openai-codex"
|
||||
assert rt["model"] == "gpt-5.5"
|
||||
assert rt["api_mode"] == "codex_responses" # downgraded so agent-loop tools dispatch
|
||||
|
||||
|
||||
def test_routing_to_different_model_marks_routed_and_resolves_credentials():
|
||||
agent = _FakeAgent()
|
||||
cfg = {"auxiliary": {"background_review": {
|
||||
"provider": "openrouter", "model": "google/gemini-3-flash-preview",
|
||||
}}}
|
||||
fake_rp = {
|
||||
"provider": "openrouter", "api_key": "or-key",
|
||||
"base_url": "https://openrouter.ai/api/v1", "api_mode": "chat_completions",
|
||||
}
|
||||
with patch("hermes_cli.config.load_config", return_value=cfg), \
|
||||
patch("hermes_cli.runtime_provider.resolve_runtime_provider", return_value=fake_rp):
|
||||
rt = br._resolve_review_runtime(agent)
|
||||
assert rt["routed"] is True
|
||||
assert rt["provider"] == "openrouter"
|
||||
assert rt["model"] == "google/gemini-3-flash-preview"
|
||||
assert rt["api_key"] == "or-key"
|
||||
|
||||
|
||||
def test_routing_same_model_as_parent_is_not_routed():
|
||||
agent = _FakeAgent(provider="openrouter", model="anthropic/claude-opus-4.8")
|
||||
cfg = {"auxiliary": {"background_review": {
|
||||
"provider": "openrouter", "model": "anthropic/claude-opus-4.8",
|
||||
}}}
|
||||
with patch("hermes_cli.config.load_config", return_value=cfg):
|
||||
rt = br._resolve_review_runtime(agent)
|
||||
assert rt["routed"] is False # same model/provider → keep full-replay path
|
||||
|
||||
|
||||
def test_routing_resolution_failure_falls_back_to_parent():
|
||||
agent = _FakeAgent()
|
||||
cfg = {"auxiliary": {"background_review": {
|
||||
"provider": "openrouter", "model": "google/gemini-3-flash-preview",
|
||||
}}}
|
||||
with patch("hermes_cli.config.load_config", return_value=cfg), \
|
||||
patch("hermes_cli.runtime_provider.resolve_runtime_provider",
|
||||
side_effect=RuntimeError("boom")):
|
||||
rt = br._resolve_review_runtime(agent)
|
||||
assert rt["routed"] is False
|
||||
assert rt["provider"] == "openai-codex"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _digest_history — routed-path compact replay
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_digest_under_tail_returns_full():
|
||||
msgs = [_msg("user", "hi"), _msg("assistant", "hello")]
|
||||
assert br._digest_history(msgs, tail=24) == msgs
|
||||
|
||||
|
||||
def test_digest_collapses_old_keeps_tail_verbatim():
|
||||
msgs = []
|
||||
for i in range(60):
|
||||
msgs.append(_msg("user", f"u{i} " + "x" * 50))
|
||||
msgs.append(_msg("assistant", f"a{i} " + "y" * 50))
|
||||
out = br._digest_history(msgs, tail=10)
|
||||
# First message is the synthetic digest (user role → alternation preserved).
|
||||
assert out[0]["role"] == "user"
|
||||
assert out[0]["content"].startswith("[Earlier conversation digest")
|
||||
# Recent tail preserved verbatim.
|
||||
assert out[-1] == msgs[-1]
|
||||
assert len(out) == 11 # 1 digest + 10 tail
|
||||
|
||||
|
||||
def test_digest_does_not_open_tail_on_a_tool_message():
|
||||
msgs = []
|
||||
for i in range(40):
|
||||
msgs.append(_msg("user", "u" + "x" * 50))
|
||||
msgs.append(_msg("assistant", "", tool_calls=[
|
||||
{"function": {"name": "terminal", "arguments": "{}"}}]))
|
||||
msgs.append({"role": "tool", "content": "result " + "w" * 50})
|
||||
out = br._digest_history(msgs, tail=2)
|
||||
# The verbatim tail (after the digest) must not begin on a bare tool message.
|
||||
assert out[1]["role"] != "tool"
|
||||
|
||||
|
||||
def test_digest_records_tool_names_in_arc():
|
||||
old = [
|
||||
_msg("user", "do the thing"),
|
||||
_msg("assistant", "", tool_calls=[
|
||||
{"function": {"name": "skill_view", "arguments": "{}"}},
|
||||
{"function": {"name": "patch", "arguments": "{}"}}]),
|
||||
]
|
||||
msgs = old + [_msg("user", f"tail{i}") for i in range(30)]
|
||||
out = br._digest_history(msgs, tail=10)
|
||||
digest = out[0]["content"]
|
||||
assert "USER: do the thing" in digest
|
||||
assert "tools: skill_view, patch" in digest
|
||||
|
|
@ -270,6 +270,31 @@ display:
|
|||
> writes to your memory/skill stores, are unaffected by this setting. Set it
|
||||
> per-platform via `display.platforms.<platform>.memory_notifications`.
|
||||
|
||||
## Running the review on a cheaper model (`auxiliary.background_review`)
|
||||
|
||||
The review runs on your **main chat model** by default, replaying the
|
||||
conversation — which is already warm in the prompt cache, so it's cheap cache
|
||||
reads. On an expensive main model you can run the review on a cheaper model
|
||||
instead:
|
||||
|
||||
```yaml
|
||||
auxiliary:
|
||||
background_review:
|
||||
provider: openrouter
|
||||
model: google/gemini-3-flash-preview # auto (default) = main chat model
|
||||
```
|
||||
|
||||
When you point it at a model **different** from your main one, the review runs
|
||||
there for substantially lower cost (~3–5× in benchmarks). Because a different
|
||||
model can't reuse your main model's prompt cache anyway, the fork automatically
|
||||
replays a compact **digest** of the conversation (recent turns verbatim + a
|
||||
summary of older ones) rather than the full transcript — minimizing what it
|
||||
writes to the new cache. Capture holds: in testing, memory capture was
|
||||
identical and skill capture near-identical to the main-model review.
|
||||
|
||||
Leave it at `auto` (or set it to your main model) and nothing changes — the
|
||||
review keeps running on the main model with the full warm-cache replay.
|
||||
|
||||
## Controlling skill writes (`skills.write_approval`)
|
||||
|
||||
Skills use the same on/off gate, but the review UX differs because a
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue