"""Tests that the background review agent restricts tools at runtime, not at schema time. Regression coverage for issue #15204 (the background skill-review agent must not perform non-skill side effects like terminal, send_message, delegate_task) combined with issue #25322 / PR #17276 (the review fork must hit the parent's Anthropic/OpenRouter prefix cache). Reconciling the two: the fork now inherits the parent's full ``tools`` schema so the cache-key matches, and enforces the memory+skills restriction at runtime via a thread-local whitelist on the existing ``get_pre_tool_call_block_message`` gate. Safety is preserved mechanically (any non-whitelisted dispatch is blocked) without the schema-level narrowing that caused the prefix-cache miss. """ import threading from unittest.mock import patch def _make_agent_stub(agent_cls): """Create a minimal AIAgent-like object with just enough state for _spawn_background_review.""" agent = object.__new__(agent_cls) agent.model = "test-model" agent.platform = "test" agent.provider = "openai" agent.session_id = "sess-123" agent.quiet_mode = True agent._memory_store = None agent._memory_enabled = True agent._user_profile_enabled = False agent._memory_nudge_interval = 5 agent._skill_nudge_interval = 5 agent.background_review_callback = None agent.status_callback = None agent._cached_system_prompt = None import datetime as _dt agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0) agent._MEMORY_REVIEW_PROMPT = "review memory" agent._SKILL_REVIEW_PROMPT = "review skills" agent._COMBINED_REVIEW_PROMPT = "review both" return agent class _SyncThread: """Drop-in replacement for threading.Thread that runs the target inline.""" def __init__(self, *, target=None, daemon=None, name=None): self._target = target def start(self): if self._target: self._target() def test_background_review_does_not_narrow_toolset_schema(): """The review fork must NOT pass enabled_toolsets to AIAgent. Narrowing the schema diverges the ``tools`` cache key from the parent's, which sits above ``system`` in Anthropic's cache hierarchy and forces a full prefix-cache miss on every review (see #25322, PR #17276). """ import run_agent agent = _make_agent_stub(run_agent.AIAgent) captured = {} def _capture_init(self, *args, **kwargs): captured["enabled_toolsets"] = kwargs.get("enabled_toolsets", "UNSET") raise RuntimeError("stop after capturing init args") with patch.object(run_agent.AIAgent, "__init__", _capture_init), \ patch("threading.Thread", _SyncThread): agent._spawn_background_review( messages_snapshot=[], review_memory=True, review_skills=False, ) assert "enabled_toolsets" in captured, "AIAgent.__init__ was not called" # The kwarg must be absent — letting AIAgent inherit the default full # toolset so the schema bytes match the parent's. assert captured["enabled_toolsets"] == "UNSET", ( f"Review fork narrowed the toolset schema (got {captured['enabled_toolsets']!r}), " "which breaks prefix-cache parity with the parent." ) def test_background_review_installs_thread_local_whitelist(): """The review fork must install a memory/skills-only thread-local whitelist. The schema-level toolset narrowing was lifted (for prefix-cache parity), so #15204's safety contract now relies on the runtime whitelist gate to deny terminal/send_message/delegate_task at dispatch time. Verify the whitelist is set with exactly the memory+skills tool names. """ import run_agent from hermes_cli import plugins as _plugins captured = {} def _capture_whitelist(whitelist, deny_msg_fmt=None): captured["whitelist"] = set(whitelist) captured["deny_msg_fmt"] = deny_msg_fmt # Stop here — we just want to see what gets installed. raise RuntimeError("stop after capturing whitelist") agent = _make_agent_stub(run_agent.AIAgent) def _no_init(self, *args, **kwargs): # Don't crash AIAgent.__init__; let execution flow reach # set_thread_tool_whitelist. return None with patch.object(run_agent.AIAgent, "__init__", _no_init), \ patch.object(_plugins, "set_thread_tool_whitelist", _capture_whitelist), \ patch("threading.Thread", _SyncThread): agent._spawn_background_review( messages_snapshot=[], review_memory=True, review_skills=False, ) assert "whitelist" in captured, "set_thread_tool_whitelist was not called" whitelist = captured["whitelist"] # memory + skills tools must be allowed assert "memory" in whitelist assert "skill_manage" in whitelist assert "skill_view" in whitelist assert "skills_list" in whitelist # dangerous tools must NOT be in the whitelist assert "terminal" not in whitelist assert "send_message" not in whitelist assert "delegate_task" not in whitelist assert "web_search" not in whitelist assert "execute_code" not in whitelist def test_background_review_agent_tools_are_limited(): """Verify the resolved memory+skills toolsets only contain memory and skill tools. Sanity check on the source of truth for what the runtime whitelist is derived from — if a future PR adds e.g. `terminal` to the `memory` toolset, the review-fork safety contract silently breaks. """ from toolsets import resolve_multiple_toolsets expected_tools = set(resolve_multiple_toolsets(["memory", "skills"])) assert "memory" in expected_tools assert "skill_manage" in expected_tools assert "skill_view" in expected_tools assert "skills_list" in expected_tools assert "terminal" not in expected_tools assert "send_message" not in expected_tools assert "delegate_task" not in expected_tools assert "web_search" not in expected_tools assert "execute_code" not in expected_tools