From 8c6b0c9ecdabd67cb22b34e5c294e3f0aba47bbc Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Wed, 13 May 2026 22:06:31 -0700 Subject: [PATCH] test(memory): cover cache-parity + runtime whitelist on background review fork MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - test_background_review_does_not_narrow_toolset_schema: review fork must NOT pass enabled_toolsets to AIAgent (full parent schema = matching Anthropic cache key on the 'tools' field). - test_background_review_installs_thread_local_whitelist: the runtime whitelist that replaces schema-level narrowing must contain memory + skills tools and exclude terminal / send_message / delegate_task / web_search / execute_code. - test_review_fork_inherits_parent_cached_system_prompt: new test for PR #17276's first root cause — the fork's _cached_system_prompt must equal the parent's byte-for-byte. - test_review_fork_pins_session_start_and_session_id: defensive belt-and- suspenders for the cached-prompt inheritance. Inverted the original test_background_review_agent_uses_restricted_toolsets (which asserted the schema-level narrowing) — that narrowing was the direct cause of #25322's cache miss, and the runtime whitelist replaces its safety claim without breaking cache parity. Refs #25322, #15204, PR #17276. --- tests/run_agent/test_background_review.py | 3 + .../test_background_review_cache_parity.py | 185 ++++++++++++++++++ ...t_background_review_toolset_restriction.py | 94 ++++++++- 3 files changed, 273 insertions(+), 9 deletions(-) create mode 100644 tests/run_agent/test_background_review_cache_parity.py diff --git a/tests/run_agent/test_background_review.py b/tests/run_agent/test_background_review.py index 8f2a61b7504..2e79b10b346 100644 --- a/tests/run_agent/test_background_review.py +++ b/tests/run_agent/test_background_review.py @@ -20,6 +20,9 @@ def _bare_agent() -> AIAgent: agent._memory_store = object() agent._memory_enabled = True agent._user_profile_enabled = False + agent._cached_system_prompt = "test-cached-system-prompt" + import datetime as _dt + agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0) agent._MEMORY_REVIEW_PROMPT = "review memory" agent._SKILL_REVIEW_PROMPT = "review skills" agent._COMBINED_REVIEW_PROMPT = "review both" diff --git a/tests/run_agent/test_background_review_cache_parity.py b/tests/run_agent/test_background_review_cache_parity.py new file mode 100644 index 00000000000..ac91cf75f7a --- /dev/null +++ b/tests/run_agent/test_background_review_cache_parity.py @@ -0,0 +1,185 @@ +"""Tests that the background review fork inherits the parent's cached system prompt. + +Regression coverage for issue #25322 (and PR #17276's first root cause): the +background review's outbound HTTP request must carry the same system bytes as +the parent's so Anthropic/OpenRouter's exact-prefix cache key matches. + +Without this, every review rebuilds the system prompt from scratch — fresh +``_hermes_now()`` timestamp, fresh ``session_id``, and a different skills +prompt under the (former) narrow toolset — and the prefix-cache miss costs +roughly the full uncached system-prompt cost per nudge (~26% end-to-end on +Sonnet 4.5 per the contributor's measurement). +""" + +from unittest.mock import patch + + +def _make_agent_stub(agent_cls): + """Create a minimal AIAgent-like object with just enough state for _spawn_background_review.""" + agent = object.__new__(agent_cls) + agent.model = "test-model" + agent.platform = "test" + agent.provider = "openai" + agent.session_id = "sess-123" + agent.quiet_mode = True + agent._memory_store = None + agent._memory_enabled = True + agent._user_profile_enabled = False + agent._memory_nudge_interval = 5 + agent._skill_nudge_interval = 5 + agent.background_review_callback = None + agent.status_callback = None + agent._cached_system_prompt = ( + "PARENT-SYSTEM-PROMPT-BYTES — must be inherited verbatim " + "for prefix-cache parity" + ) + import datetime as _dt + agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0) + agent._MEMORY_REVIEW_PROMPT = "review memory" + agent._SKILL_REVIEW_PROMPT = "review skills" + agent._COMBINED_REVIEW_PROMPT = "review both" + return agent + + +class _SyncThread: + """Drop-in replacement for threading.Thread that runs the target inline.""" + + def __init__(self, *, target=None, daemon=None, name=None): + self._target = target + + def start(self): + if self._target: + self._target() + + +class _ReviewAgentRecorder: + """Stand-in for the review-fork AIAgent that records the prompt assignment.""" + + def __init__(self, *args, **kwargs): + self._cached_system_prompt = None + self._memory_write_origin = None + self._memory_write_context = None + self._memory_store = None + self._memory_enabled = None + self._user_profile_enabled = None + self._memory_nudge_interval = None + self._skill_nudge_interval = None + self.suppress_status_output = None + + def run_conversation(self, *args, **kwargs): + raise RuntimeError("stop after recording state — don't actually call the API") + + def shutdown_memory_provider(self): + pass + + def close(self): + pass + + +def test_review_fork_inherits_parent_cached_system_prompt(): + """The review fork's _cached_system_prompt must equal the parent's byte-for-byte. + + Anthropic's prefix cache keys on exact bytes; any divergence (timestamp + minute tick, fresh session_id, narrower skills_prompt) shifts the key + and forces a full re-cache. Inheriting the parent's cached prompt is + the cheap, mechanical fix. + """ + import run_agent + + agent = _make_agent_stub(run_agent.AIAgent) + + captured = {} + parent_prompt = agent._cached_system_prompt + + # Hook the assignment site: record what gets put on the review agent. + real_recorder_init = _ReviewAgentRecorder.__init__ + + def _recorder_init(self, *args, **kwargs): + real_recorder_init(self, *args, **kwargs) + # The actual production code assigns _cached_system_prompt AFTER __init__, + # so we need to capture it on attribute set. Use a property-style sentinel + # via __setattr__ on this instance. + + with patch.object(run_agent, "AIAgent", _ReviewAgentRecorder), \ + patch("threading.Thread", _SyncThread): + # Wrap the recorder's __setattr__ so we can see the _cached_system_prompt + # write that _spawn_background_review performs after construction. + orig_setattr = _ReviewAgentRecorder.__setattr__ + + def _spy_setattr(self, name, value): + if name == "_cached_system_prompt": + captured["written_prompt"] = value + orig_setattr(self, name, value) + + with patch.object(_ReviewAgentRecorder, "__setattr__", _spy_setattr): + agent._spawn_background_review( + messages_snapshot=[], + review_memory=True, + review_skills=False, + ) + + assert "written_prompt" in captured, ( + "_spawn_background_review never assigned _cached_system_prompt on the review agent" + ) + assert captured["written_prompt"] == parent_prompt, ( + f"Review fork's _cached_system_prompt diverged from parent's. " + f"Got {captured['written_prompt']!r}, expected {parent_prompt!r}. " + "This breaks Anthropic/OpenRouter prefix-cache parity (#25322)." + ) + + +def test_review_fork_pins_session_start_and_session_id(): + """Defensive complement to cached-system-prompt inheritance. + + Even though ``_cached_system_prompt`` inheritance short-circuits the + normal rebuild path, pinning ``session_start`` and ``session_id`` to + the parent's guarantees byte-identical output from any code path that + re-renders parts of the system prompt (compression, plugin hooks). + """ + import run_agent + + agent = _make_agent_stub(run_agent.AIAgent) + + captured = {} + + class _Recorder: + def __init__(self, *args, **kwargs): + self._cached_system_prompt = None + self._memory_write_origin = None + self._memory_write_context = None + self._memory_store = None + self._memory_enabled = None + self._user_profile_enabled = None + self._memory_nudge_interval = None + self._skill_nudge_interval = None + self.suppress_status_output = None + self.session_start = None + self.session_id = None + + def run_conversation(self, *args, **kwargs): + captured["session_start"] = self.session_start + captured["session_id"] = self.session_id + raise RuntimeError("stop after recording") + + def shutdown_memory_provider(self): + pass + + def close(self): + pass + + with patch.object(run_agent, "AIAgent", _Recorder), \ + patch("threading.Thread", _SyncThread): + agent._spawn_background_review( + messages_snapshot=[], + review_memory=True, + review_skills=False, + ) + + assert captured.get("session_start") == agent.session_start, ( + "Review fork did not inherit parent's session_start — " + "system-prompt rebuild paths would diverge." + ) + assert captured.get("session_id") == agent.session_id, ( + "Review fork did not inherit parent's session_id — " + "system-prompt rebuild paths would diverge." + ) diff --git a/tests/run_agent/test_background_review_toolset_restriction.py b/tests/run_agent/test_background_review_toolset_restriction.py index d1193dc6f91..7eea665b86f 100644 --- a/tests/run_agent/test_background_review_toolset_restriction.py +++ b/tests/run_agent/test_background_review_toolset_restriction.py @@ -1,8 +1,16 @@ -"""Tests that the background review agent is restricted to memory+skills toolsets. +"""Tests that the background review agent restricts tools at runtime, not at schema time. -Regression coverage for issue #15204: the background skill-review agent -inherited the full default toolset, allowing it to perform non-skill side -effects (terminal, send_message, delegate_task, etc.). +Regression coverage for issue #15204 (the background skill-review agent must +not perform non-skill side effects like terminal, send_message, delegate_task) +combined with issue #25322 / PR #17276 (the review fork must hit the parent's +Anthropic/OpenRouter prefix cache). + +Reconciling the two: the fork now inherits the parent's full ``tools`` schema +so the cache-key matches, and enforces the memory+skills restriction at +runtime via a thread-local whitelist on the existing +``get_pre_tool_call_block_message`` gate. Safety is preserved mechanically +(any non-whitelisted dispatch is blocked) without the schema-level narrowing +that caused the prefix-cache miss. """ import threading @@ -24,6 +32,9 @@ def _make_agent_stub(agent_cls): agent._skill_nudge_interval = 5 agent.background_review_callback = None agent.status_callback = None + agent._cached_system_prompt = None + import datetime as _dt + agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0) agent._MEMORY_REVIEW_PROMPT = "review memory" agent._SKILL_REVIEW_PROMPT = "review skills" agent._COMBINED_REVIEW_PROMPT = "review both" @@ -41,15 +52,20 @@ class _SyncThread: self._target() -def test_background_review_agent_uses_restricted_toolsets(): - """The review agent must only have access to 'memory' and 'skills' toolsets.""" +def test_background_review_does_not_narrow_toolset_schema(): + """The review fork must NOT pass enabled_toolsets to AIAgent. + + Narrowing the schema diverges the ``tools`` cache key from the parent's, + which sits above ``system`` in Anthropic's cache hierarchy and forces a + full prefix-cache miss on every review (see #25322, PR #17276). + """ import run_agent agent = _make_agent_stub(run_agent.AIAgent) captured = {} def _capture_init(self, *args, **kwargs): - captured["enabled_toolsets"] = kwargs.get("enabled_toolsets") + captured["enabled_toolsets"] = kwargs.get("enabled_toolsets", "UNSET") raise RuntimeError("stop after capturing init args") with patch.object(run_agent.AIAgent, "__init__", _capture_init), \ @@ -61,11 +77,71 @@ def test_background_review_agent_uses_restricted_toolsets(): ) assert "enabled_toolsets" in captured, "AIAgent.__init__ was not called" - assert sorted(captured["enabled_toolsets"]) == ["memory", "skills"] + # The kwarg must be absent — letting AIAgent inherit the default full + # toolset so the schema bytes match the parent's. + assert captured["enabled_toolsets"] == "UNSET", ( + f"Review fork narrowed the toolset schema (got {captured['enabled_toolsets']!r}), " + "which breaks prefix-cache parity with the parent." + ) + + +def test_background_review_installs_thread_local_whitelist(): + """The review fork must install a memory/skills-only thread-local whitelist. + + The schema-level toolset narrowing was lifted (for prefix-cache parity), + so #15204's safety contract now relies on the runtime whitelist gate to + deny terminal/send_message/delegate_task at dispatch time. Verify the + whitelist is set with exactly the memory+skills tool names. + """ + import run_agent + from hermes_cli import plugins as _plugins + + captured = {} + + def _capture_whitelist(whitelist, deny_msg_fmt=None): + captured["whitelist"] = set(whitelist) + captured["deny_msg_fmt"] = deny_msg_fmt + # Stop here — we just want to see what gets installed. + raise RuntimeError("stop after capturing whitelist") + + agent = _make_agent_stub(run_agent.AIAgent) + + def _no_init(self, *args, **kwargs): + # Don't crash AIAgent.__init__; let execution flow reach + # set_thread_tool_whitelist. + return None + + with patch.object(run_agent.AIAgent, "__init__", _no_init), \ + patch.object(_plugins, "set_thread_tool_whitelist", _capture_whitelist), \ + patch("threading.Thread", _SyncThread): + agent._spawn_background_review( + messages_snapshot=[], + review_memory=True, + review_skills=False, + ) + + assert "whitelist" in captured, "set_thread_tool_whitelist was not called" + whitelist = captured["whitelist"] + # memory + skills tools must be allowed + assert "memory" in whitelist + assert "skill_manage" in whitelist + assert "skill_view" in whitelist + assert "skills_list" in whitelist + # dangerous tools must NOT be in the whitelist + assert "terminal" not in whitelist + assert "send_message" not in whitelist + assert "delegate_task" not in whitelist + assert "web_search" not in whitelist + assert "execute_code" not in whitelist def test_background_review_agent_tools_are_limited(): - """Verify the resolved memory+skills toolsets only contain memory and skill tools.""" + """Verify the resolved memory+skills toolsets only contain memory and skill tools. + + Sanity check on the source of truth for what the runtime whitelist is + derived from — if a future PR adds e.g. `terminal` to the `memory` + toolset, the review-fork safety contract silently breaks. + """ from toolsets import resolve_multiple_toolsets expected_tools = set(resolve_multiple_toolsets(["memory", "skills"]))