From c3a09f78352c70cf89f10ebc66a4faaf1a27e6d4 Mon Sep 17 00:00:00 2001 From: Ziliang Peng Date: Wed, 20 May 2026 18:10:27 -0700 Subject: [PATCH] fix(background_review): propagate parent toolset config to keep tools[] cache-stable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary The background skill/memory-review fork constructed a child `AIAgent` without propagating `enabled_toolsets` / `disabled_toolsets` from the parent. When the parent narrowed its toolset (via `hermes tools disable` or `config.yaml`), the fork's default `enabled_toolsets=None` expanded to "all registered tools" — and the fork's outbound request body sent a wider `tools[]` array than the parent's main-turn request. Anthropic's prompt-cache key includes the `tools[]` array byte-for-byte, so this divergence forked the cache lineage on every nudge and forced a full prefix rewrite. On a captured ~4 hour Claude-via-Hermes session this cost roughly 4.3 M cache-write tokens — about half of those attributable to the per-nudge alternation between the main turn's narrowed `tools[]` and the review fork's wider `tools[]`. ## Goal Extend the byte-stability invariant established by PR #17276 (which fixed `system`) to the `tools[]` slot of the request body, so the review fork's outbound request hits the parent's warmed Anthropic prefix cache regardless of how the parent's toolset is configured. ## Implementation Two-line change in `agent/background_review.py`: pass `enabled_toolsets=getattr(agent, "enabled_toolsets", None)` and the matching `disabled_toolsets` kwarg into the `AIAgent(...)` call inside `_spawn_background_review`. Adds an explanatory block comment that calls out the cache-key dependency and the relationship to PR #17276. The post-construction runtime whitelist (`set_thread_tool_whitelist({memory, skills})`) is untouched — it still gates which tools the model is allowed to *dispatch*. This change aligns only what the request body *transmits*, not what the review is allowed to do, so the safety contract from issue #15204 remains intact. ## Testing - `tests/run_agent/test_background_review_cache_parity.py`: new `test_review_fork_inherits_parent_toolset_config` asserts the parent's `enabled_toolsets` and `disabled_toolsets` reach the review-fork constructor as kwargs. - `tests/run_agent/test_background_review_toolset_restriction.py`: the existing `test_background_review_does_not_narrow_toolset_schema` was inverted (its old "must NOT pass enabled_toolsets" rule was built on the assumption that the parent always ran with the registry default — wrong in practice when the parent is narrowed). Renamed to `test_background_review_matches_parent_toolset_config` and updated to assert the parent's value propagates verbatim. - Verified the new positive test fails without the fix and passes with it. - Full suite for `test_background_review*`: ``` $ python -m pytest tests/run_agent/test_background_review.py \ tests/run_agent/test_background_review_summary.py \ tests/run_agent/test_background_review_toolset_restriction.py \ tests/run_agent/test_background_review_cache_parity.py -q 18 passed in 1.85s ``` ## Scope - `agent/background_review.py`: 2 added kwargs + explanatory comment. - Two test files: one new positive test, one inverted existing test. - No production code paths outside the review fork; no schema changes; no public-API changes. Refs: ziliangpeng/hermes-agent#1 (root-cause analysis with wire-level cache-write measurements). Extends PR #17276's `system`-bytes invariant to the `tools[]` slot. --- agent/background_review.py | 20 +++++ .../test_background_review_cache_parity.py | 77 +++++++++++++++++++ ...t_background_review_toolset_restriction.py | 48 +++++++++--- 3 files changed, 135 insertions(+), 10 deletions(-) diff --git a/agent/background_review.py b/agent/background_review.py index 5488da08de3..d425d6c08e4 100644 --- a/agent/background_review.py +++ b/agent/background_review.py @@ -390,6 +390,24 @@ def _run_review_in_thread( # parent below so memory(action="add") writes from # the review still land on disk; the review just # has zero side effects on external providers. + # Inherit the parent's toolset configuration so the review + # fork's outbound request body has byte-identical ``tools[]`` + # with the parent's last main-turn request. Without this, + # ``enabled_toolsets=None`` defaults to "all registered tools" + # and the fork transmits every tool descriptor (including any + # the user has disabled via ``hermes tools disable``), while + # the parent transmits only its narrower configured set — + # making the two requests diverge in ``tools[]`` even though + # they share ``messages[0..N]`` and ``system`` byte-for-byte. + # Anthropic's prompt-cache key includes ``tools[]``, so any + # divergence forks the cache lineage and forces a full + # prefix rewrite (~100-200K tokens per turn for long convs). + # The post-construction whitelist (``set_thread_tool_whitelist`` + # below) still restricts which tools the model is allowed + # to dispatch — this change only aligns what the request + # body transmits, not what the review is allowed to do. + # This extends the byte-stability invariant established by + # PR #17276 (which fixed ``system``) to ``tools[]``. review_agent = AIAgent( model=agent.model, max_iterations=16, @@ -401,6 +419,8 @@ def _run_review_in_thread( api_key=_parent_runtime.get("api_key") or None, credential_pool=getattr(agent, "_credential_pool", None), parent_session_id=agent.session_id, + enabled_toolsets=getattr(agent, "enabled_toolsets", None), + disabled_toolsets=getattr(agent, "disabled_toolsets", None), skip_memory=True, ) review_agent._memory_write_origin = "background_review" diff --git a/tests/run_agent/test_background_review_cache_parity.py b/tests/run_agent/test_background_review_cache_parity.py index ac91cf75f7a..74b29028627 100644 --- a/tests/run_agent/test_background_review_cache_parity.py +++ b/tests/run_agent/test_background_review_cache_parity.py @@ -38,6 +38,13 @@ def _make_agent_stub(agent_cls): agent._MEMORY_REVIEW_PROMPT = "review memory" agent._SKILL_REVIEW_PROMPT = "review skills" agent._COMBINED_REVIEW_PROMPT = "review both" + # Parent's toolset configuration — must be propagated to the review + # fork so ``tools[]`` matches byte-for-byte. Without these set on the + # stub, ``getattr(agent, ..., None)`` would return None on both sides + # and the test wouldn't catch a regression where the fork is built + # without the kwargs at all. + agent.enabled_toolsets = ["memory", "skills", "terminal"] + agent.disabled_toolsets = ["spotify", "feishu_doc"] return agent @@ -183,3 +190,73 @@ def test_review_fork_pins_session_start_and_session_id(): "Review fork did not inherit parent's session_id — " "system-prompt rebuild paths would diverge." ) + + +def test_review_fork_inherits_parent_toolset_config(): + """The review fork must receive ``enabled_toolsets`` / ``disabled_toolsets`` + from the parent so the outbound request body's ``tools[]`` field matches + byte-for-byte. + + Without this, ``enabled_toolsets=None`` defaults to "all registered tools" + and the fork sends every tool descriptor (e.g. Spotify, Feishu, video) + even when the parent disabled them via ``hermes tools disable``. Anthropic's + prompt cache keys on the byte-exact ``tools[]`` array, so divergence here + forks the cache lineage and forces a full prefix rewrite per nudge + (~100-200 K cache-write tokens for long conversations). + + This is the same byte-stability invariant as + ``test_review_fork_inherits_parent_cached_system_prompt`` but for the + ``tools[]`` slot of the request body, not the ``system`` slot. + """ + import run_agent + + agent = _make_agent_stub(run_agent.AIAgent) + + captured = {} + + class _Recorder: + def __init__(self, *args, **kwargs): + captured["enabled_toolsets"] = kwargs.get("enabled_toolsets") + captured["disabled_toolsets"] = kwargs.get("disabled_toolsets") + # Minimal post-init attrs the surrounding code touches. + self._cached_system_prompt = None + self._memory_write_origin = None + self._memory_write_context = None + self._memory_store = None + self._memory_enabled = None + self._user_profile_enabled = None + self._memory_nudge_interval = None + self._skill_nudge_interval = None + self.suppress_status_output = None + self.session_start = None + self.session_id = None + + def run_conversation(self, *args, **kwargs): + raise RuntimeError("stop after recording — don't actually call the API") + + def shutdown_memory_provider(self): + pass + + def close(self): + pass + + with patch.object(run_agent, "AIAgent", _Recorder), \ + patch("threading.Thread", _SyncThread): + agent._spawn_background_review( + messages_snapshot=[], + review_memory=True, + review_skills=False, + ) + + assert captured.get("enabled_toolsets") == agent.enabled_toolsets, ( + f"Review fork did not receive parent's enabled_toolsets. " + f"Got {captured.get('enabled_toolsets')!r}, expected {agent.enabled_toolsets!r}. " + "This causes ``tools[]`` to diverge between main turns and review nudges, " + "breaking Anthropic prompt-cache parity." + ) + assert captured.get("disabled_toolsets") == agent.disabled_toolsets, ( + f"Review fork did not receive parent's disabled_toolsets. " + f"Got {captured.get('disabled_toolsets')!r}, expected {agent.disabled_toolsets!r}. " + "This causes ``tools[]`` to diverge between main turns and review nudges, " + "breaking Anthropic prompt-cache parity." + ) diff --git a/tests/run_agent/test_background_review_toolset_restriction.py b/tests/run_agent/test_background_review_toolset_restriction.py index 7eea665b86f..5282c92e1ba 100644 --- a/tests/run_agent/test_background_review_toolset_restriction.py +++ b/tests/run_agent/test_background_review_toolset_restriction.py @@ -38,6 +38,13 @@ def _make_agent_stub(agent_cls): agent._MEMORY_REVIEW_PROMPT = "review memory" agent._SKILL_REVIEW_PROMPT = "review skills" agent._COMBINED_REVIEW_PROMPT = "review both" + # Parent's toolset configuration must propagate to the review fork + # so the request body's ``tools[]`` array is byte-identical. Without + # propagation, ``enabled_toolsets=None`` expands to "all registered + # tools" — which diverges from a parent that narrowed its set via + # ``hermes tools disable`` or config. + agent.enabled_toolsets = ["memory", "skills", "terminal"] + agent.disabled_toolsets = ["spotify", "feishu_doc"] return agent @@ -52,12 +59,25 @@ class _SyncThread: self._target() -def test_background_review_does_not_narrow_toolset_schema(): - """The review fork must NOT pass enabled_toolsets to AIAgent. +def test_background_review_matches_parent_toolset_config(): + """The review fork must propagate the parent's ``enabled_toolsets`` to AIAgent. - Narrowing the schema diverges the ``tools`` cache key from the parent's, - which sits above ``system`` in Anthropic's cache hierarchy and forces a - full prefix-cache miss on every review (see #25322, PR #17276). + Earlier guidance (the old "do NOT pass enabled_toolsets" rule) assumed the + parent always ran with the registry default. In practice the parent is + often narrowed via ``hermes tools disable`` or ``config.yaml``, and + leaving ``enabled_toolsets=None`` on the fork makes it expand to ALL + registered tools — which is what *diverges* from the parent and breaks + Anthropic's prefix cache key on ``tools[]``. + + The correct rule is symmetric: whatever the parent has, the fork + must have the same. When the parent's value is ``None``, the fork's + must also be ``None`` (and they'll both expand identically). When + the parent narrows, the fork inherits the narrowed set verbatim. + + (Schema-level alignment with the parent + post-construction runtime + whitelist remain the safety contract for #15204 — the whitelist + blocks dispatch of non-memory/skill tools regardless of what schemas + are sent over the wire.) """ import run_agent @@ -66,6 +86,7 @@ def test_background_review_does_not_narrow_toolset_schema(): def _capture_init(self, *args, **kwargs): captured["enabled_toolsets"] = kwargs.get("enabled_toolsets", "UNSET") + captured["disabled_toolsets"] = kwargs.get("disabled_toolsets", "UNSET") raise RuntimeError("stop after capturing init args") with patch.object(run_agent.AIAgent, "__init__", _capture_init), \ @@ -77,11 +98,18 @@ def test_background_review_does_not_narrow_toolset_schema(): ) assert "enabled_toolsets" in captured, "AIAgent.__init__ was not called" - # The kwarg must be absent — letting AIAgent inherit the default full - # toolset so the schema bytes match the parent's. - assert captured["enabled_toolsets"] == "UNSET", ( - f"Review fork narrowed the toolset schema (got {captured['enabled_toolsets']!r}), " - "which breaks prefix-cache parity with the parent." + # The kwargs must equal the parent's so the ``tools[]`` request-body + # bytes match the parent's last main-turn request. + assert captured["enabled_toolsets"] == agent.enabled_toolsets, ( + f"Review fork did not propagate parent's enabled_toolsets " + f"(got {captured['enabled_toolsets']!r}, expected {agent.enabled_toolsets!r}). " + "This causes ``tools[]`` to diverge from the parent — Anthropic's " + "prompt-cache key includes ``tools[]``, so divergence forks the " + "cache lineage and forces a full prefix rewrite per nudge." + ) + assert captured["disabled_toolsets"] == agent.disabled_toolsets, ( + f"Review fork did not propagate parent's disabled_toolsets " + f"(got {captured['disabled_toolsets']!r}, expected {agent.disabled_toolsets!r})." )