From 8c6b0c9ecdabd67cb22b34e5c294e3f0aba47bbc Mon Sep 17 00:00:00 2001
From: teknium1 <127238744+teknium1@users.noreply.github.com>
Date: Wed, 13 May 2026 22:06:31 -0700
Subject: [PATCH] test(memory): cover cache-parity + runtime whitelist on
 background review fork
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- test_background_review_does_not_narrow_toolset_schema: review fork must
  NOT pass enabled_toolsets to AIAgent (full parent schema = matching
  Anthropic cache key on the 'tools' field).
- test_background_review_installs_thread_local_whitelist: the runtime
  whitelist that replaces schema-level narrowing must contain memory +
  skills tools and exclude terminal / send_message / delegate_task /
  web_search / execute_code.
- test_review_fork_inherits_parent_cached_system_prompt: new test for
  PR #17276's first root cause — the fork's _cached_system_prompt must
  equal the parent's byte-for-byte.
- test_review_fork_pins_session_start_and_session_id: defensive belt-and-
  suspenders for the cached-prompt inheritance.

Inverted the original test_background_review_agent_uses_restricted_toolsets
(which asserted the schema-level narrowing) — that narrowing was the
direct cause of #25322's cache miss, and the runtime whitelist replaces
its safety claim without breaking cache parity.

Refs #25322, #15204, PR #17276.
---
 tests/run_agent/test_background_review.py     |   3 +
 .../test_background_review_cache_parity.py    | 185 ++++++++++++++++++
 ...t_background_review_toolset_restriction.py |  94 ++++++++-
 3 files changed, 273 insertions(+), 9 deletions(-)
 create mode 100644 tests/run_agent/test_background_review_cache_parity.py

diff --git a/tests/run_agent/test_background_review.py b/tests/run_agent/test_background_review.py
index 8f2a61b7504..2e79b10b346 100644
--- a/tests/run_agent/test_background_review.py
+++ b/tests/run_agent/test_background_review.py
@@ -20,6 +20,9 @@ def _bare_agent() -> AIAgent:
     agent._memory_store = object()
     agent._memory_enabled = True
     agent._user_profile_enabled = False
+    agent._cached_system_prompt = "test-cached-system-prompt"
+    import datetime as _dt
+    agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0)
     agent._MEMORY_REVIEW_PROMPT = "review memory"
     agent._SKILL_REVIEW_PROMPT = "review skills"
     agent._COMBINED_REVIEW_PROMPT = "review both"
diff --git a/tests/run_agent/test_background_review_cache_parity.py b/tests/run_agent/test_background_review_cache_parity.py
new file mode 100644
index 00000000000..ac91cf75f7a
--- /dev/null
+++ b/tests/run_agent/test_background_review_cache_parity.py
@@ -0,0 +1,185 @@
+"""Tests that the background review fork inherits the parent's cached system prompt.
+
+Regression coverage for issue #25322 (and PR #17276's first root cause): the
+background review's outbound HTTP request must carry the same system bytes as
+the parent's so Anthropic/OpenRouter's exact-prefix cache key matches.
+
+Without this, every review rebuilds the system prompt from scratch — fresh
+``_hermes_now()`` timestamp, fresh ``session_id``, and a different skills
+prompt under the (former) narrow toolset — and the prefix-cache miss costs
+roughly the full uncached system-prompt cost per nudge (~26% end-to-end on
+Sonnet 4.5 per the contributor's measurement).
+"""
+
+from unittest.mock import patch
+
+
+def _make_agent_stub(agent_cls):
+    """Create a minimal AIAgent-like object with just enough state for _spawn_background_review."""
+    agent = object.__new__(agent_cls)
+    agent.model = "test-model"
+    agent.platform = "test"
+    agent.provider = "openai"
+    agent.session_id = "sess-123"
+    agent.quiet_mode = True
+    agent._memory_store = None
+    agent._memory_enabled = True
+    agent._user_profile_enabled = False
+    agent._memory_nudge_interval = 5
+    agent._skill_nudge_interval = 5
+    agent.background_review_callback = None
+    agent.status_callback = None
+    agent._cached_system_prompt = (
+        "PARENT-SYSTEM-PROMPT-BYTES — must be inherited verbatim "
+        "for prefix-cache parity"
+    )
+    import datetime as _dt
+    agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0)
+    agent._MEMORY_REVIEW_PROMPT = "review memory"
+    agent._SKILL_REVIEW_PROMPT = "review skills"
+    agent._COMBINED_REVIEW_PROMPT = "review both"
+    return agent
+
+
+class _SyncThread:
+    """Drop-in replacement for threading.Thread that runs the target inline."""
+
+    def __init__(self, *, target=None, daemon=None, name=None):
+        self._target = target
+
+    def start(self):
+        if self._target:
+            self._target()
+
+
+class _ReviewAgentRecorder:
+    """Stand-in for the review-fork AIAgent that records the prompt assignment."""
+
+    def __init__(self, *args, **kwargs):
+        self._cached_system_prompt = None
+        self._memory_write_origin = None
+        self._memory_write_context = None
+        self._memory_store = None
+        self._memory_enabled = None
+        self._user_profile_enabled = None
+        self._memory_nudge_interval = None
+        self._skill_nudge_interval = None
+        self.suppress_status_output = None
+
+    def run_conversation(self, *args, **kwargs):
+        raise RuntimeError("stop after recording state — don't actually call the API")
+
+    def shutdown_memory_provider(self):
+        pass
+
+    def close(self):
+        pass
+
+
+def test_review_fork_inherits_parent_cached_system_prompt():
+    """The review fork's _cached_system_prompt must equal the parent's byte-for-byte.
+
+    Anthropic's prefix cache keys on exact bytes; any divergence (timestamp
+    minute tick, fresh session_id, narrower skills_prompt) shifts the key
+    and forces a full re-cache. Inheriting the parent's cached prompt is
+    the cheap, mechanical fix.
+    """
+    import run_agent
+
+    agent = _make_agent_stub(run_agent.AIAgent)
+
+    captured = {}
+    parent_prompt = agent._cached_system_prompt
+
+    # Hook the assignment site: record what gets put on the review agent.
+    real_recorder_init = _ReviewAgentRecorder.__init__
+
+    def _recorder_init(self, *args, **kwargs):
+        real_recorder_init(self, *args, **kwargs)
+        # The actual production code assigns _cached_system_prompt AFTER __init__,
+        # so we need to capture it on attribute set. Use a property-style sentinel
+        # via __setattr__ on this instance.
+
+    with patch.object(run_agent, "AIAgent", _ReviewAgentRecorder), \
+         patch("threading.Thread", _SyncThread):
+        # Wrap the recorder's __setattr__ so we can see the _cached_system_prompt
+        # write that _spawn_background_review performs after construction.
+        orig_setattr = _ReviewAgentRecorder.__setattr__
+
+        def _spy_setattr(self, name, value):
+            if name == "_cached_system_prompt":
+                captured["written_prompt"] = value
+            orig_setattr(self, name, value)
+
+        with patch.object(_ReviewAgentRecorder, "__setattr__", _spy_setattr):
+            agent._spawn_background_review(
+                messages_snapshot=[],
+                review_memory=True,
+                review_skills=False,
+            )
+
+    assert "written_prompt" in captured, (
+        "_spawn_background_review never assigned _cached_system_prompt on the review agent"
+    )
+    assert captured["written_prompt"] == parent_prompt, (
+        f"Review fork's _cached_system_prompt diverged from parent's. "
+        f"Got {captured['written_prompt']!r}, expected {parent_prompt!r}. "
+        "This breaks Anthropic/OpenRouter prefix-cache parity (#25322)."
+    )
+
+
+def test_review_fork_pins_session_start_and_session_id():
+    """Defensive complement to cached-system-prompt inheritance.
+
+    Even though ``_cached_system_prompt`` inheritance short-circuits the
+    normal rebuild path, pinning ``session_start`` and ``session_id`` to
+    the parent's guarantees byte-identical output from any code path that
+    re-renders parts of the system prompt (compression, plugin hooks).
+    """
+    import run_agent
+
+    agent = _make_agent_stub(run_agent.AIAgent)
+
+    captured = {}
+
+    class _Recorder:
+        def __init__(self, *args, **kwargs):
+            self._cached_system_prompt = None
+            self._memory_write_origin = None
+            self._memory_write_context = None
+            self._memory_store = None
+            self._memory_enabled = None
+            self._user_profile_enabled = None
+            self._memory_nudge_interval = None
+            self._skill_nudge_interval = None
+            self.suppress_status_output = None
+            self.session_start = None
+            self.session_id = None
+
+        def run_conversation(self, *args, **kwargs):
+            captured["session_start"] = self.session_start
+            captured["session_id"] = self.session_id
+            raise RuntimeError("stop after recording")
+
+        def shutdown_memory_provider(self):
+            pass
+
+        def close(self):
+            pass
+
+    with patch.object(run_agent, "AIAgent", _Recorder), \
+         patch("threading.Thread", _SyncThread):
+        agent._spawn_background_review(
+            messages_snapshot=[],
+            review_memory=True,
+            review_skills=False,
+        )
+
+    assert captured.get("session_start") == agent.session_start, (
+        "Review fork did not inherit parent's session_start — "
+        "system-prompt rebuild paths would diverge."
+    )
+    assert captured.get("session_id") == agent.session_id, (
+        "Review fork did not inherit parent's session_id — "
+        "system-prompt rebuild paths would diverge."
+    )
diff --git a/tests/run_agent/test_background_review_toolset_restriction.py b/tests/run_agent/test_background_review_toolset_restriction.py
index d1193dc6f91..7eea665b86f 100644
--- a/tests/run_agent/test_background_review_toolset_restriction.py
+++ b/tests/run_agent/test_background_review_toolset_restriction.py
@@ -1,8 +1,16 @@
-"""Tests that the background review agent is restricted to memory+skills toolsets.
+"""Tests that the background review agent restricts tools at runtime, not at schema time.
 
-Regression coverage for issue #15204: the background skill-review agent
-inherited the full default toolset, allowing it to perform non-skill side
-effects (terminal, send_message, delegate_task, etc.).
+Regression coverage for issue #15204 (the background skill-review agent must
+not perform non-skill side effects like terminal, send_message, delegate_task)
+combined with issue #25322 / PR #17276 (the review fork must hit the parent's
+Anthropic/OpenRouter prefix cache).
+
+Reconciling the two: the fork now inherits the parent's full ``tools`` schema
+so the cache-key matches, and enforces the memory+skills restriction at
+runtime via a thread-local whitelist on the existing
+``get_pre_tool_call_block_message`` gate. Safety is preserved mechanically
+(any non-whitelisted dispatch is blocked) without the schema-level narrowing
+that caused the prefix-cache miss.
 """
 
 import threading
@@ -24,6 +32,9 @@ def _make_agent_stub(agent_cls):
     agent._skill_nudge_interval = 5
     agent.background_review_callback = None
     agent.status_callback = None
+    agent._cached_system_prompt = None
+    import datetime as _dt
+    agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0)
     agent._MEMORY_REVIEW_PROMPT = "review memory"
     agent._SKILL_REVIEW_PROMPT = "review skills"
     agent._COMBINED_REVIEW_PROMPT = "review both"
@@ -41,15 +52,20 @@ class _SyncThread:
             self._target()
 
 
-def test_background_review_agent_uses_restricted_toolsets():
-    """The review agent must only have access to 'memory' and 'skills' toolsets."""
+def test_background_review_does_not_narrow_toolset_schema():
+    """The review fork must NOT pass enabled_toolsets to AIAgent.
+
+    Narrowing the schema diverges the ``tools`` cache key from the parent's,
+    which sits above ``system`` in Anthropic's cache hierarchy and forces a
+    full prefix-cache miss on every review (see #25322, PR #17276).
+    """
     import run_agent
 
     agent = _make_agent_stub(run_agent.AIAgent)
     captured = {}
 
     def _capture_init(self, *args, **kwargs):
-        captured["enabled_toolsets"] = kwargs.get("enabled_toolsets")
+        captured["enabled_toolsets"] = kwargs.get("enabled_toolsets", "UNSET")
         raise RuntimeError("stop after capturing init args")
 
     with patch.object(run_agent.AIAgent, "__init__", _capture_init), \
@@ -61,11 +77,71 @@ def test_background_review_agent_uses_restricted_toolsets():
         )
 
     assert "enabled_toolsets" in captured, "AIAgent.__init__ was not called"
-    assert sorted(captured["enabled_toolsets"]) == ["memory", "skills"]
+    # The kwarg must be absent — letting AIAgent inherit the default full
+    # toolset so the schema bytes match the parent's.
+    assert captured["enabled_toolsets"] == "UNSET", (
+        f"Review fork narrowed the toolset schema (got {captured['enabled_toolsets']!r}), "
+        "which breaks prefix-cache parity with the parent."
+    )
+
+
+def test_background_review_installs_thread_local_whitelist():
+    """The review fork must install a memory/skills-only thread-local whitelist.
+
+    The schema-level toolset narrowing was lifted (for prefix-cache parity),
+    so #15204's safety contract now relies on the runtime whitelist gate to
+    deny terminal/send_message/delegate_task at dispatch time. Verify the
+    whitelist is set with exactly the memory+skills tool names.
+    """
+    import run_agent
+    from hermes_cli import plugins as _plugins
+
+    captured = {}
+
+    def _capture_whitelist(whitelist, deny_msg_fmt=None):
+        captured["whitelist"] = set(whitelist)
+        captured["deny_msg_fmt"] = deny_msg_fmt
+        # Stop here — we just want to see what gets installed.
+        raise RuntimeError("stop after capturing whitelist")
+
+    agent = _make_agent_stub(run_agent.AIAgent)
+
+    def _no_init(self, *args, **kwargs):
+        # Don't crash AIAgent.__init__; let execution flow reach
+        # set_thread_tool_whitelist.
+        return None
+
+    with patch.object(run_agent.AIAgent, "__init__", _no_init), \
+         patch.object(_plugins, "set_thread_tool_whitelist", _capture_whitelist), \
+         patch("threading.Thread", _SyncThread):
+        agent._spawn_background_review(
+            messages_snapshot=[],
+            review_memory=True,
+            review_skills=False,
+        )
+
+    assert "whitelist" in captured, "set_thread_tool_whitelist was not called"
+    whitelist = captured["whitelist"]
+    # memory + skills tools must be allowed
+    assert "memory" in whitelist
+    assert "skill_manage" in whitelist
+    assert "skill_view" in whitelist
+    assert "skills_list" in whitelist
+    # dangerous tools must NOT be in the whitelist
+    assert "terminal" not in whitelist
+    assert "send_message" not in whitelist
+    assert "delegate_task" not in whitelist
+    assert "web_search" not in whitelist
+    assert "execute_code" not in whitelist
 
 
 def test_background_review_agent_tools_are_limited():
-    """Verify the resolved memory+skills toolsets only contain memory and skill tools."""
+    """Verify the resolved memory+skills toolsets only contain memory and skill tools.
+
+    Sanity check on the source of truth for what the runtime whitelist is
+    derived from — if a future PR adds e.g. `terminal` to the `memory`
+    toolset, the review-fork safety contract silently breaks.
+    """
     from toolsets import resolve_multiple_toolsets
 
     expected_tools = set(resolve_multiple_toolsets(["memory", "skills"]))