mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-04 07:31:58 +00:00
## Summary

The background skill/memory-review fork constructed a child `AIAgent` without propagating `enabled_toolsets` / `disabled_toolsets` from the parent. When the parent narrowed its toolset (via `hermes tools disable` or `config.yaml`), the fork's default `enabled_toolsets=None` expanded to "all registered tools" — and the fork's outbound request body sent a wider `tools[]` array than the parent's main-turn request. Anthropic's prompt-cache key includes the `tools[]` array byte-for-byte, so this divergence forked the cache lineage on every nudge and forced a full prefix rewrite. On a captured ~4-hour Claude-via-Hermes session this cost roughly 4.3 M cache-write tokens — about half of those attributable to the per-nudge alternation between the main turn's narrowed `tools[]` and the review fork's wider `tools[]`.

## Goal

Extend the byte-stability invariant established by PR #17276 (which fixed `system`) to the `tools[]` slot of the request body, so the review fork's outbound request hits the parent's warmed Anthropic prefix cache regardless of how the parent's toolset is configured.

## Implementation

Two-line change in `agent/background_review.py`: pass `enabled_toolsets=getattr(agent, "enabled_toolsets", None)` and the matching `disabled_toolsets` kwarg into the `AIAgent(...)` call inside `_spawn_background_review`. Adds an explanatory block comment that calls out the cache-key dependency and the relationship to PR #17276.

The post-construction runtime whitelist (`set_thread_tool_whitelist({memory, skills})`) is untouched — it still gates which tools the model is allowed to *dispatch*. This change aligns only what the request body *transmits*, not what the review is allowed to do, so the safety contract from issue #15204 remains intact.

## Testing

- `tests/run_agent/test_background_review_cache_parity.py`: new `test_review_fork_inherits_parent_toolset_config` asserts the parent's `enabled_toolsets` and `disabled_toolsets` reach the review-fork constructor as kwargs.
- `tests/run_agent/test_background_review_toolset_restriction.py`: the existing `test_background_review_does_not_narrow_toolset_schema` was inverted (its old "must NOT pass enabled_toolsets" rule was built on the assumption that the parent always ran with the registry default — wrong in practice when the parent is narrowed). Renamed to `test_background_review_matches_parent_toolset_config` and updated to assert the parent's value propagates verbatim.
- Verified the new positive test fails without the fix and passes with it.
- Full suite for `test_background_review*`:

```
$ python -m pytest tests/run_agent/test_background_review.py \
    tests/run_agent/test_background_review_summary.py \
    tests/run_agent/test_background_review_toolset_restriction.py \
    tests/run_agent/test_background_review_cache_parity.py -q
18 passed in 1.85s
```

## Scope

- `agent/background_review.py`: 2 added kwargs + explanatory comment.
- Two test files: one new positive test, one inverted existing test.
- No production code paths outside the review fork; no schema changes; no public-API changes.

Refs: ziliangpeng/hermes-agent#1 (root-cause analysis with wire-level cache-write measurements). Extends PR #17276's `system`-bytes invariant to the `tools[]` slot.
262 lines
10 KiB
Python
262 lines
10 KiB
Python
"""Tests that the background review fork inherits the parent's cached system prompt.
|
|
|
|
Regression coverage for issue #25322 (and PR #17276's first root cause): the
|
|
background review's outbound HTTP request must carry the same system bytes as
|
|
the parent's so Anthropic/OpenRouter's exact-prefix cache key matches.
|
|
|
|
Without this, every review rebuilds the system prompt from scratch — fresh
|
|
``_hermes_now()`` timestamp, fresh ``session_id``, and a different skills
|
|
prompt under the (former) narrow toolset — and the prefix-cache miss costs
|
|
roughly the full uncached system-prompt cost per nudge (~26% end-to-end on
|
|
Sonnet 4.5 per the contributor's measurement).
|
|
"""
|
|
|
|
from unittest.mock import patch
|
|
|
|
|
|
def _make_agent_stub(agent_cls):
|
|
"""Create a minimal AIAgent-like object with just enough state for _spawn_background_review."""
|
|
agent = object.__new__(agent_cls)
|
|
agent.model = "test-model"
|
|
agent.platform = "test"
|
|
agent.provider = "openai"
|
|
agent.session_id = "sess-123"
|
|
agent.quiet_mode = True
|
|
agent._memory_store = None
|
|
agent._memory_enabled = True
|
|
agent._user_profile_enabled = False
|
|
agent._memory_nudge_interval = 5
|
|
agent._skill_nudge_interval = 5
|
|
agent.background_review_callback = None
|
|
agent.status_callback = None
|
|
agent._cached_system_prompt = (
|
|
"PARENT-SYSTEM-PROMPT-BYTES — must be inherited verbatim "
|
|
"for prefix-cache parity"
|
|
)
|
|
import datetime as _dt
|
|
agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0)
|
|
agent._MEMORY_REVIEW_PROMPT = "review memory"
|
|
agent._SKILL_REVIEW_PROMPT = "review skills"
|
|
agent._COMBINED_REVIEW_PROMPT = "review both"
|
|
# Parent's toolset configuration — must be propagated to the review
|
|
# fork so ``tools[]`` matches byte-for-byte. Without these set on the
|
|
# stub, ``getattr(agent, ..., None)`` would return None on both sides
|
|
# and the test wouldn't catch a regression where the fork is built
|
|
# without the kwargs at all.
|
|
agent.enabled_toolsets = ["memory", "skills", "terminal"]
|
|
agent.disabled_toolsets = ["spotify", "feishu_doc"]
|
|
return agent
|
|
|
|
|
|
class _SyncThread:
|
|
"""Drop-in replacement for threading.Thread that runs the target inline."""
|
|
|
|
def __init__(self, *, target=None, daemon=None, name=None):
|
|
self._target = target
|
|
|
|
def start(self):
|
|
if self._target:
|
|
self._target()
|
|
|
|
|
|
class _ReviewAgentRecorder:
|
|
"""Stand-in for the review-fork AIAgent that records the prompt assignment."""
|
|
|
|
def __init__(self, *args, **kwargs):
|
|
self._cached_system_prompt = None
|
|
self._memory_write_origin = None
|
|
self._memory_write_context = None
|
|
self._memory_store = None
|
|
self._memory_enabled = None
|
|
self._user_profile_enabled = None
|
|
self._memory_nudge_interval = None
|
|
self._skill_nudge_interval = None
|
|
self.suppress_status_output = None
|
|
|
|
def run_conversation(self, *args, **kwargs):
|
|
raise RuntimeError("stop after recording state — don't actually call the API")
|
|
|
|
def shutdown_memory_provider(self):
|
|
pass
|
|
|
|
def close(self):
|
|
pass
|
|
|
|
|
|
def test_review_fork_inherits_parent_cached_system_prompt():
    """The review fork's _cached_system_prompt must equal the parent's byte-for-byte.

    Anthropic's prefix cache keys on exact bytes; any divergence (timestamp
    minute tick, fresh session_id, narrower skills_prompt) shifts the key
    and forces a full re-cache. Inheriting the parent's cached prompt is
    the cheap, mechanical fix.
    """
    import run_agent

    agent = _make_agent_stub(run_agent.AIAgent)
    parent_prompt = agent._cached_system_prompt

    captured = {}

    class _SpyRecorder(_ReviewAgentRecorder):
        """Recorder that notes ``_cached_system_prompt`` writes made after construction.

        The base recorder's ``__init__`` itself sets the attribute to ``None``.
        That write is deliberately not recorded; otherwise the "never
        assigned" assertion below could not fail.
        """

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Set the marker through object.__setattr__ so it bypasses the spy.
            object.__setattr__(self, "_constructed", True)

        def __setattr__(self, name, value):
            if name == "_cached_system_prompt" and getattr(self, "_constructed", False):
                captured["written_prompt"] = value
            super().__setattr__(name, value)

    # Swap in the spy for the fork's agent class and run the review thread
    # inline so the prompt write has happened by the time the call returns.
    with patch.object(run_agent, "AIAgent", _SpyRecorder), \
            patch("threading.Thread", _SyncThread):
        agent._spawn_background_review(
            messages_snapshot=[],
            review_memory=True,
            review_skills=False,
        )

    assert "written_prompt" in captured, (
        "_spawn_background_review never assigned _cached_system_prompt on the review agent"
    )
    assert captured["written_prompt"] == parent_prompt, (
        "Review fork's _cached_system_prompt diverged from parent's. "
        f"Got {captured['written_prompt']!r}, expected {parent_prompt!r}. "
        "This breaks Anthropic/OpenRouter prefix-cache parity (#25322)."
    )
|
|
|
|
|
|
def test_review_fork_pins_session_start_and_session_id():
    """Check that the fork carries the parent's ``session_start`` and ``session_id``.

    This complements cached-system-prompt inheritance. Inheriting
    ``_cached_system_prompt`` short-circuits the normal rebuild path, but
    pinning these two fields to the parent's values keeps the output
    byte-identical for any code path that re-renders parts of the system
    prompt (compression, plugin hooks).
    """
    import run_agent

    parent = _make_agent_stub(run_agent.AIAgent)
    seen = {}

    class _Recorder:
        """Fork stand-in that snapshots the session fields when the review runs."""

        def __init__(self, *args, **kwargs):
            for attr in (
                "_cached_system_prompt",
                "_memory_write_origin",
                "_memory_write_context",
                "_memory_store",
                "_memory_enabled",
                "_user_profile_enabled",
                "_memory_nudge_interval",
                "_skill_nudge_interval",
                "suppress_status_output",
                "session_start",
                "session_id",
            ):
                setattr(self, attr, None)

        def run_conversation(self, *args, **kwargs):
            # Record what the fork holds at the moment the review would start,
            # then abort before anything else happens.
            for field in ("session_start", "session_id"):
                seen[field] = getattr(self, field)
            raise RuntimeError("stop after recording")

        def shutdown_memory_provider(self):
            return None

        def close(self):
            return None

    agent_patch = patch.object(run_agent, "AIAgent", _Recorder)
    thread_patch = patch("threading.Thread", _SyncThread)
    with agent_patch, thread_patch:
        parent._spawn_background_review(
            messages_snapshot=[],
            review_memory=True,
            review_skills=False,
        )

    assert seen.get("session_start") == parent.session_start, (
        "Review fork did not inherit parent's session_start — "
        "system-prompt rebuild paths would diverge."
    )
    assert seen.get("session_id") == parent.session_id, (
        "Review fork did not inherit parent's session_id — "
        "system-prompt rebuild paths would diverge."
    )
|
|
|
|
|
|
def test_review_fork_inherits_parent_toolset_config():
    """Check that the parent's toolset kwargs reach the review-fork constructor.

    The fork must be built with the parent's ``enabled_toolsets`` and
    ``disabled_toolsets`` so the ``tools[]`` field of its outbound request
    body matches the parent's byte-for-byte.

    If they are not passed, ``enabled_toolsets=None`` defaults to "all
    registered tools" and the fork sends every tool descriptor (e.g. Spotify,
    Feishu, video) even when the parent disabled them via
    ``hermes tools disable``. Anthropic's prompt cache keys on the byte-exact
    ``tools[]`` array, so that divergence forks the cache lineage and forces
    a full prefix rewrite per nudge (~100-200 K cache-write tokens for long
    conversations).

    Same byte-stability invariant as
    ``test_review_fork_inherits_parent_cached_system_prompt``, applied to the
    ``tools[]`` slot of the request body rather than the ``system`` slot.
    """
    import run_agent

    parent = _make_agent_stub(run_agent.AIAgent)
    seen = {}

    class _Recorder:
        """Fork stand-in that records the toolset kwargs it was constructed with."""

        def __init__(self, *args, **kwargs):
            for key in ("enabled_toolsets", "disabled_toolsets"):
                seen[key] = kwargs.get(key)
            # Minimal post-init attrs the surrounding code touches.
            for attr in (
                "_cached_system_prompt",
                "_memory_write_origin",
                "_memory_write_context",
                "_memory_store",
                "_memory_enabled",
                "_user_profile_enabled",
                "_memory_nudge_interval",
                "_skill_nudge_interval",
                "suppress_status_output",
                "session_start",
                "session_id",
            ):
                setattr(self, attr, None)

        def run_conversation(self, *args, **kwargs):
            raise RuntimeError("stop after recording — don't actually call the API")

        def shutdown_memory_provider(self):
            return None

        def close(self):
            return None

    agent_patch = patch.object(run_agent, "AIAgent", _Recorder)
    thread_patch = patch("threading.Thread", _SyncThread)
    with agent_patch, thread_patch:
        parent._spawn_background_review(
            messages_snapshot=[],
            review_memory=True,
            review_skills=False,
        )

    got_enabled = seen.get("enabled_toolsets")
    assert got_enabled == parent.enabled_toolsets, (
        "Review fork did not receive parent's enabled_toolsets. "
        f"Got {got_enabled!r}, expected {parent.enabled_toolsets!r}. "
        "This causes ``tools[]`` to diverge between main turns and review nudges, "
        "breaking Anthropic prompt-cache parity."
    )

    got_disabled = seen.get("disabled_toolsets")
    assert got_disabled == parent.disabled_toolsets, (
        "Review fork did not receive parent's disabled_toolsets. "
        f"Got {got_disabled!r}, expected {parent.disabled_toolsets!r}. "
        "This causes ``tools[]`` to diverge between main turns and review nudges, "
        "breaking Anthropic prompt-cache parity."
    )
|