From c3a09f78352c70cf89f10ebc66a4faaf1a27e6d4 Mon Sep 17 00:00:00 2001
From: Ziliang Peng <ziliangdotme@gmail.com>
Date: Wed, 20 May 2026 18:10:27 -0700
Subject: [PATCH] fix(background_review): propagate parent toolset config to
 keep tools[] cache-stable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

The background skill/memory-review fork constructed a child `AIAgent`
without propagating `enabled_toolsets` / `disabled_toolsets` from the
parent. When the parent narrowed its toolset (via `hermes tools
disable` or `config.yaml`), the fork's default `enabled_toolsets=None`
expanded to "all registered tools" — and the fork's outbound request
body sent a wider `tools[]` array than the parent's main-turn request.

Anthropic's prompt-cache key includes the `tools[]` array byte-for-byte,
so this divergence forked the cache lineage on every nudge and forced a
full prefix rewrite. On a captured ~4 hour Claude-via-Hermes session
this cost roughly 4.3 M cache-write tokens — about half of those
attributable to the per-nudge alternation between the main turn's
narrowed `tools[]` and the review fork's wider `tools[]`.

## Goal

Extend the byte-stability invariant established by PR #17276 (which
fixed `system`) to the `tools[]` slot of the request body, so the
review fork's outbound request hits the parent's warmed Anthropic
prefix cache regardless of how the parent's toolset is configured.

## Implementation

Two-line change in `agent/background_review.py`: pass
`enabled_toolsets=getattr(agent, "enabled_toolsets", None)` and the
matching `disabled_toolsets` kwarg into the `AIAgent(...)` call inside
`_spawn_background_review`. Adds an explanatory block comment that
calls out the cache-key dependency and the relationship to PR #17276.

The post-construction runtime whitelist
(`set_thread_tool_whitelist({memory, skills})`) is untouched — it
still gates which tools the model is allowed to *dispatch*. This
change aligns only what the request body *transmits*, not what the
review is allowed to do, so the safety contract from issue #15204
remains intact.

## Testing

- `tests/run_agent/test_background_review_cache_parity.py`: new
  `test_review_fork_inherits_parent_toolset_config` asserts the
  parent's `enabled_toolsets` and `disabled_toolsets` reach the
  review-fork constructor as kwargs.
- `tests/run_agent/test_background_review_toolset_restriction.py`:
  the existing `test_background_review_does_not_narrow_toolset_schema`
  was inverted (its old "must NOT pass enabled_toolsets" rule was
  built on the assumption that the parent always ran with the
  registry default — wrong in practice when the parent is narrowed).
  Renamed to `test_background_review_matches_parent_toolset_config`
  and updated to assert the parent's value propagates verbatim.
- Verified the new positive test fails without the fix and passes
  with it.
- Full suite for `test_background_review*`:

  ```
  $ python -m pytest tests/run_agent/test_background_review.py \
                     tests/run_agent/test_background_review_summary.py \
                     tests/run_agent/test_background_review_toolset_restriction.py \
                     tests/run_agent/test_background_review_cache_parity.py -q
  18 passed in 1.85s
  ```

## Scope

- `agent/background_review.py`: 2 added kwargs + explanatory comment.
- Two test files: one new positive test, one inverted existing test.
- No production code paths outside the review fork; no schema changes;
  no public-API changes.

Refs: ziliangpeng/hermes-agent#1 (root-cause analysis with wire-level
cache-write measurements). Extends PR #17276's `system`-bytes
invariant to the `tools[]` slot.
---
 agent/background_review.py                    | 20 +++++
 .../test_background_review_cache_parity.py    | 77 +++++++++++++++++++
 ...t_background_review_toolset_restriction.py | 48 +++++++++---
 3 files changed, 135 insertions(+), 10 deletions(-)

diff --git a/agent/background_review.py b/agent/background_review.py
index 5488da08de3..d425d6c08e4 100644
--- a/agent/background_review.py
+++ b/agent/background_review.py
@@ -390,6 +390,24 @@ def _run_review_in_thread(
             # parent below so memory(action="add") writes from
             # the review still land on disk; the review just
             # has zero side effects on external providers.
+            # Inherit the parent's toolset configuration so the review
+            # fork's outbound request body has byte-identical ``tools[]``
+            # with the parent's last main-turn request. Without this,
+            # ``enabled_toolsets=None`` defaults to "all registered tools"
+            # and the fork transmits every tool descriptor (including any
+            # the user has disabled via ``hermes tools disable``), while
+            # the parent transmits only its narrower configured set —
+            # making the two requests diverge in ``tools[]`` even though
+            # they share ``messages[0..N]`` and ``system`` byte-for-byte.
+            # Anthropic's prompt-cache key includes ``tools[]``, so any
+            # divergence forks the cache lineage and forces a full
+            # prefix rewrite (~100-200K tokens per turn for long convs).
+            # The post-construction whitelist (``set_thread_tool_whitelist``
+            # below) still restricts which tools the model is allowed
+            # to dispatch — this change only aligns what the request
+            # body transmits, not what the review is allowed to do.
+            # This extends the byte-stability invariant established by
+            # PR #17276 (which fixed ``system``) to ``tools[]``.
             review_agent = AIAgent(
                 model=agent.model,
                 max_iterations=16,
@@ -401,6 +419,8 @@ def _run_review_in_thread(
                 api_key=_parent_runtime.get("api_key") or None,
                 credential_pool=getattr(agent, "_credential_pool", None),
                 parent_session_id=agent.session_id,
+                enabled_toolsets=getattr(agent, "enabled_toolsets", None),
+                disabled_toolsets=getattr(agent, "disabled_toolsets", None),
                 skip_memory=True,
             )
             review_agent._memory_write_origin = "background_review"
diff --git a/tests/run_agent/test_background_review_cache_parity.py b/tests/run_agent/test_background_review_cache_parity.py
index ac91cf75f7a..74b29028627 100644
--- a/tests/run_agent/test_background_review_cache_parity.py
+++ b/tests/run_agent/test_background_review_cache_parity.py
@@ -38,6 +38,13 @@ def _make_agent_stub(agent_cls):
     agent._MEMORY_REVIEW_PROMPT = "review memory"
     agent._SKILL_REVIEW_PROMPT = "review skills"
     agent._COMBINED_REVIEW_PROMPT = "review both"
+    # Parent's toolset configuration — must be propagated to the review
+    # fork so ``tools[]`` matches byte-for-byte. Without these set on the
+    # stub, ``getattr(agent, ..., None)`` would return None on both sides
+    # and the test wouldn't catch a regression where the fork is built
+    # without the kwargs at all.
+    agent.enabled_toolsets = ["memory", "skills", "terminal"]
+    agent.disabled_toolsets = ["spotify", "feishu_doc"]
     return agent
 
 
@@ -183,3 +190,73 @@ def test_review_fork_pins_session_start_and_session_id():
         "Review fork did not inherit parent's session_id — "
         "system-prompt rebuild paths would diverge."
     )
+
+
+def test_review_fork_inherits_parent_toolset_config():
+    """The review fork must receive ``enabled_toolsets`` / ``disabled_toolsets``
+    from the parent so the outbound request body's ``tools[]`` field matches
+    byte-for-byte.
+
+    Without this, ``enabled_toolsets=None`` defaults to "all registered tools"
+    and the fork sends every tool descriptor (e.g. Spotify, Feishu, video)
+    even when the parent disabled them via ``hermes tools disable``. Anthropic's
+    prompt cache keys on the byte-exact ``tools[]`` array, so divergence here
+    forks the cache lineage and forces a full prefix rewrite per nudge
+    (~100-200 K cache-write tokens for long conversations).
+
+    This is the same byte-stability invariant as
+    ``test_review_fork_inherits_parent_cached_system_prompt`` but for the
+    ``tools[]`` slot of the request body, not the ``system`` slot.
+    """
+    import run_agent
+
+    agent = _make_agent_stub(run_agent.AIAgent)
+
+    captured = {}
+
+    class _Recorder:
+        def __init__(self, *args, **kwargs):
+            captured["enabled_toolsets"] = kwargs.get("enabled_toolsets")
+            captured["disabled_toolsets"] = kwargs.get("disabled_toolsets")
+            # Minimal post-init attrs the surrounding code touches.
+            self._cached_system_prompt = None
+            self._memory_write_origin = None
+            self._memory_write_context = None
+            self._memory_store = None
+            self._memory_enabled = None
+            self._user_profile_enabled = None
+            self._memory_nudge_interval = None
+            self._skill_nudge_interval = None
+            self.suppress_status_output = None
+            self.session_start = None
+            self.session_id = None
+
+        def run_conversation(self, *args, **kwargs):
+            raise RuntimeError("stop after recording — don't actually call the API")
+
+        def shutdown_memory_provider(self):
+            pass
+
+        def close(self):
+            pass
+
+    with patch.object(run_agent, "AIAgent", _Recorder), \
+         patch("threading.Thread", _SyncThread):
+        agent._spawn_background_review(
+            messages_snapshot=[],
+            review_memory=True,
+            review_skills=False,
+        )
+
+    assert captured.get("enabled_toolsets") == agent.enabled_toolsets, (
+        f"Review fork did not receive parent's enabled_toolsets. "
+        f"Got {captured.get('enabled_toolsets')!r}, expected {agent.enabled_toolsets!r}. "
+        "This causes ``tools[]`` to diverge between main turns and review nudges, "
+        "breaking Anthropic prompt-cache parity."
+    )
+    assert captured.get("disabled_toolsets") == agent.disabled_toolsets, (
+        f"Review fork did not receive parent's disabled_toolsets. "
+        f"Got {captured.get('disabled_toolsets')!r}, expected {agent.disabled_toolsets!r}. "
+        "This causes ``tools[]`` to diverge between main turns and review nudges, "
+        "breaking Anthropic prompt-cache parity."
+    )
diff --git a/tests/run_agent/test_background_review_toolset_restriction.py b/tests/run_agent/test_background_review_toolset_restriction.py
index 7eea665b86f..5282c92e1ba 100644
--- a/tests/run_agent/test_background_review_toolset_restriction.py
+++ b/tests/run_agent/test_background_review_toolset_restriction.py
@@ -38,6 +38,13 @@ def _make_agent_stub(agent_cls):
     agent._MEMORY_REVIEW_PROMPT = "review memory"
     agent._SKILL_REVIEW_PROMPT = "review skills"
     agent._COMBINED_REVIEW_PROMPT = "review both"
+    # Parent's toolset configuration must propagate to the review fork
+    # so the request body's ``tools[]`` array is byte-identical. Without
+    # propagation, ``enabled_toolsets=None`` expands to "all registered
+    # tools" — which diverges from a parent that narrowed its set via
+    # ``hermes tools disable`` or config.
+    agent.enabled_toolsets = ["memory", "skills", "terminal"]
+    agent.disabled_toolsets = ["spotify", "feishu_doc"]
     return agent
 
 
@@ -52,12 +59,25 @@ class _SyncThread:
             self._target()
 
 
-def test_background_review_does_not_narrow_toolset_schema():
-    """The review fork must NOT pass enabled_toolsets to AIAgent.
+def test_background_review_matches_parent_toolset_config():
+    """The review fork must propagate the parent's ``enabled_toolsets`` to AIAgent.
 
-    Narrowing the schema diverges the ``tools`` cache key from the parent's,
-    which sits above ``system`` in Anthropic's cache hierarchy and forces a
-    full prefix-cache miss on every review (see #25322, PR #17276).
+    Earlier guidance (the old "do NOT pass enabled_toolsets" rule) assumed the
+    parent always ran with the registry default. In practice the parent is
+    often narrowed via ``hermes tools disable`` or ``config.yaml``, and
+    leaving ``enabled_toolsets=None`` on the fork makes it expand to ALL
+    registered tools — which is what *diverges* from the parent and breaks
+    Anthropic's prefix cache key on ``tools[]``.
+
+    The correct rule is symmetric: whatever the parent has, the fork
+    must have the same. When the parent's value is ``None``, the fork's
+    must also be ``None`` (and they'll both expand identically). When
+    the parent narrows, the fork inherits the narrowed set verbatim.
+
+    (Schema-level alignment with the parent + post-construction runtime
+    whitelist remain the safety contract for #15204 — the whitelist
+    blocks dispatch of non-memory/skill tools regardless of what schemas
+    are sent over the wire.)
     """
     import run_agent
 
@@ -66,6 +86,7 @@ def test_background_review_does_not_narrow_toolset_schema():
 
     def _capture_init(self, *args, **kwargs):
         captured["enabled_toolsets"] = kwargs.get("enabled_toolsets", "UNSET")
+        captured["disabled_toolsets"] = kwargs.get("disabled_toolsets", "UNSET")
         raise RuntimeError("stop after capturing init args")
 
     with patch.object(run_agent.AIAgent, "__init__", _capture_init), \
@@ -77,11 +98,18 @@ def test_background_review_does_not_narrow_toolset_schema():
         )
 
     assert "enabled_toolsets" in captured, "AIAgent.__init__ was not called"
-    # The kwarg must be absent — letting AIAgent inherit the default full
-    # toolset so the schema bytes match the parent's.
-    assert captured["enabled_toolsets"] == "UNSET", (
-        f"Review fork narrowed the toolset schema (got {captured['enabled_toolsets']!r}), "
-        "which breaks prefix-cache parity with the parent."
+    # The kwargs must equal the parent's so the ``tools[]`` request-body
+    # bytes match the parent's last main-turn request.
+    assert captured["enabled_toolsets"] == agent.enabled_toolsets, (
+        f"Review fork did not propagate parent's enabled_toolsets "
+        f"(got {captured['enabled_toolsets']!r}, expected {agent.enabled_toolsets!r}). "
+        "This causes ``tools[]`` to diverge from the parent — Anthropic's "
+        "prompt-cache key includes ``tools[]``, so divergence forks the "
+        "cache lineage and forces a full prefix rewrite per nudge."
+    )
+    assert captured["disabled_toolsets"] == agent.disabled_toolsets, (
+        f"Review fork did not propagate parent's disabled_toolsets "
+        f"(got {captured['disabled_toolsets']!r}, expected {agent.disabled_toolsets!r})."
     )