mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
test(memory): cover cache-parity + runtime whitelist on background review fork
- test_background_review_does_not_narrow_toolset_schema: review fork must NOT pass enabled_toolsets to AIAgent (full parent schema = matching Anthropic cache key on the 'tools' field). - test_background_review_installs_thread_local_whitelist: the runtime whitelist that replaces schema-level narrowing must contain memory + skills tools and exclude terminal / send_message / delegate_task / web_search / execute_code. - test_review_fork_inherits_parent_cached_system_prompt: new test for PR #17276's first root cause — the fork's _cached_system_prompt must equal the parent's byte-for-byte. - test_review_fork_pins_session_start_and_session_id: defensive belt-and- suspenders for the cached-prompt inheritance. Inverted the original test_background_review_agent_uses_restricted_toolsets (which asserted the schema-level narrowing) — that narrowing was the direct cause of #25322's cache miss, and the runtime whitelist replaces its safety claim without breaking cache parity. Refs #25322, #15204, PR #17276.
This commit is contained in:
parent
07349ce4df
commit
8c6b0c9ecd
3 changed files with 273 additions and 9 deletions
|
|
@ -20,6 +20,9 @@ def _bare_agent() -> AIAgent:
|
||||||
agent._memory_store = object()
|
agent._memory_store = object()
|
||||||
agent._memory_enabled = True
|
agent._memory_enabled = True
|
||||||
agent._user_profile_enabled = False
|
agent._user_profile_enabled = False
|
||||||
|
agent._cached_system_prompt = "test-cached-system-prompt"
|
||||||
|
import datetime as _dt
|
||||||
|
agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0)
|
||||||
agent._MEMORY_REVIEW_PROMPT = "review memory"
|
agent._MEMORY_REVIEW_PROMPT = "review memory"
|
||||||
agent._SKILL_REVIEW_PROMPT = "review skills"
|
agent._SKILL_REVIEW_PROMPT = "review skills"
|
||||||
agent._COMBINED_REVIEW_PROMPT = "review both"
|
agent._COMBINED_REVIEW_PROMPT = "review both"
|
||||||
|
|
|
||||||
185
tests/run_agent/test_background_review_cache_parity.py
Normal file
185
tests/run_agent/test_background_review_cache_parity.py
Normal file
|
|
@ -0,0 +1,185 @@
|
||||||
|
"""Tests that the background review fork inherits the parent's cached system prompt.
|
||||||
|
|
||||||
|
Regression coverage for issue #25322 (and PR #17276's first root cause): the
|
||||||
|
background review's outbound HTTP request must carry the same system bytes as
|
||||||
|
the parent's so Anthropic/OpenRouter's exact-prefix cache key matches.
|
||||||
|
|
||||||
|
Without this, every review rebuilds the system prompt from scratch — fresh
|
||||||
|
``_hermes_now()`` timestamp, fresh ``session_id``, and a different skills
|
||||||
|
prompt under the (former) narrow toolset — and the prefix-cache miss costs
|
||||||
|
roughly the full uncached system-prompt cost per nudge (~26% end-to-end on
|
||||||
|
Sonnet 4.5 per the contributor's measurement).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
|
||||||
|
def _make_agent_stub(agent_cls):
|
||||||
|
"""Create a minimal AIAgent-like object with just enough state for _spawn_background_review."""
|
||||||
|
agent = object.__new__(agent_cls)
|
||||||
|
agent.model = "test-model"
|
||||||
|
agent.platform = "test"
|
||||||
|
agent.provider = "openai"
|
||||||
|
agent.session_id = "sess-123"
|
||||||
|
agent.quiet_mode = True
|
||||||
|
agent._memory_store = None
|
||||||
|
agent._memory_enabled = True
|
||||||
|
agent._user_profile_enabled = False
|
||||||
|
agent._memory_nudge_interval = 5
|
||||||
|
agent._skill_nudge_interval = 5
|
||||||
|
agent.background_review_callback = None
|
||||||
|
agent.status_callback = None
|
||||||
|
agent._cached_system_prompt = (
|
||||||
|
"PARENT-SYSTEM-PROMPT-BYTES — must be inherited verbatim "
|
||||||
|
"for prefix-cache parity"
|
||||||
|
)
|
||||||
|
import datetime as _dt
|
||||||
|
agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0)
|
||||||
|
agent._MEMORY_REVIEW_PROMPT = "review memory"
|
||||||
|
agent._SKILL_REVIEW_PROMPT = "review skills"
|
||||||
|
agent._COMBINED_REVIEW_PROMPT = "review both"
|
||||||
|
return agent
|
||||||
|
|
||||||
|
|
||||||
|
class _SyncThread:
|
||||||
|
"""Drop-in replacement for threading.Thread that runs the target inline."""
|
||||||
|
|
||||||
|
def __init__(self, *, target=None, daemon=None, name=None):
|
||||||
|
self._target = target
|
||||||
|
|
||||||
|
def start(self):
|
||||||
|
if self._target:
|
||||||
|
self._target()
|
||||||
|
|
||||||
|
|
||||||
|
class _ReviewAgentRecorder:
|
||||||
|
"""Stand-in for the review-fork AIAgent that records the prompt assignment."""
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
self._cached_system_prompt = None
|
||||||
|
self._memory_write_origin = None
|
||||||
|
self._memory_write_context = None
|
||||||
|
self._memory_store = None
|
||||||
|
self._memory_enabled = None
|
||||||
|
self._user_profile_enabled = None
|
||||||
|
self._memory_nudge_interval = None
|
||||||
|
self._skill_nudge_interval = None
|
||||||
|
self.suppress_status_output = None
|
||||||
|
|
||||||
|
def run_conversation(self, *args, **kwargs):
|
||||||
|
raise RuntimeError("stop after recording state — don't actually call the API")
|
||||||
|
|
||||||
|
def shutdown_memory_provider(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_review_fork_inherits_parent_cached_system_prompt():
|
||||||
|
"""The review fork's _cached_system_prompt must equal the parent's byte-for-byte.
|
||||||
|
|
||||||
|
Anthropic's prefix cache keys on exact bytes; any divergence (timestamp
|
||||||
|
minute tick, fresh session_id, narrower skills_prompt) shifts the key
|
||||||
|
and forces a full re-cache. Inheriting the parent's cached prompt is
|
||||||
|
the cheap, mechanical fix.
|
||||||
|
"""
|
||||||
|
import run_agent
|
||||||
|
|
||||||
|
agent = _make_agent_stub(run_agent.AIAgent)
|
||||||
|
|
||||||
|
captured = {}
|
||||||
|
parent_prompt = agent._cached_system_prompt
|
||||||
|
|
||||||
|
# Hook the assignment site: record what gets put on the review agent.
|
||||||
|
real_recorder_init = _ReviewAgentRecorder.__init__
|
||||||
|
|
||||||
|
def _recorder_init(self, *args, **kwargs):
|
||||||
|
real_recorder_init(self, *args, **kwargs)
|
||||||
|
# The actual production code assigns _cached_system_prompt AFTER __init__,
|
||||||
|
# so we need to capture it on attribute set. Use a property-style sentinel
|
||||||
|
# via __setattr__ on this instance.
|
||||||
|
|
||||||
|
with patch.object(run_agent, "AIAgent", _ReviewAgentRecorder), \
|
||||||
|
patch("threading.Thread", _SyncThread):
|
||||||
|
# Wrap the recorder's __setattr__ so we can see the _cached_system_prompt
|
||||||
|
# write that _spawn_background_review performs after construction.
|
||||||
|
orig_setattr = _ReviewAgentRecorder.__setattr__
|
||||||
|
|
||||||
|
def _spy_setattr(self, name, value):
|
||||||
|
if name == "_cached_system_prompt":
|
||||||
|
captured["written_prompt"] = value
|
||||||
|
orig_setattr(self, name, value)
|
||||||
|
|
||||||
|
with patch.object(_ReviewAgentRecorder, "__setattr__", _spy_setattr):
|
||||||
|
agent._spawn_background_review(
|
||||||
|
messages_snapshot=[],
|
||||||
|
review_memory=True,
|
||||||
|
review_skills=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert "written_prompt" in captured, (
|
||||||
|
"_spawn_background_review never assigned _cached_system_prompt on the review agent"
|
||||||
|
)
|
||||||
|
assert captured["written_prompt"] == parent_prompt, (
|
||||||
|
f"Review fork's _cached_system_prompt diverged from parent's. "
|
||||||
|
f"Got {captured['written_prompt']!r}, expected {parent_prompt!r}. "
|
||||||
|
"This breaks Anthropic/OpenRouter prefix-cache parity (#25322)."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_review_fork_pins_session_start_and_session_id():
|
||||||
|
"""Defensive complement to cached-system-prompt inheritance.
|
||||||
|
|
||||||
|
Even though ``_cached_system_prompt`` inheritance short-circuits the
|
||||||
|
normal rebuild path, pinning ``session_start`` and ``session_id`` to
|
||||||
|
the parent's guarantees byte-identical output from any code path that
|
||||||
|
re-renders parts of the system prompt (compression, plugin hooks).
|
||||||
|
"""
|
||||||
|
import run_agent
|
||||||
|
|
||||||
|
agent = _make_agent_stub(run_agent.AIAgent)
|
||||||
|
|
||||||
|
captured = {}
|
||||||
|
|
||||||
|
class _Recorder:
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
self._cached_system_prompt = None
|
||||||
|
self._memory_write_origin = None
|
||||||
|
self._memory_write_context = None
|
||||||
|
self._memory_store = None
|
||||||
|
self._memory_enabled = None
|
||||||
|
self._user_profile_enabled = None
|
||||||
|
self._memory_nudge_interval = None
|
||||||
|
self._skill_nudge_interval = None
|
||||||
|
self.suppress_status_output = None
|
||||||
|
self.session_start = None
|
||||||
|
self.session_id = None
|
||||||
|
|
||||||
|
def run_conversation(self, *args, **kwargs):
|
||||||
|
captured["session_start"] = self.session_start
|
||||||
|
captured["session_id"] = self.session_id
|
||||||
|
raise RuntimeError("stop after recording")
|
||||||
|
|
||||||
|
def shutdown_memory_provider(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
with patch.object(run_agent, "AIAgent", _Recorder), \
|
||||||
|
patch("threading.Thread", _SyncThread):
|
||||||
|
agent._spawn_background_review(
|
||||||
|
messages_snapshot=[],
|
||||||
|
review_memory=True,
|
||||||
|
review_skills=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert captured.get("session_start") == agent.session_start, (
|
||||||
|
"Review fork did not inherit parent's session_start — "
|
||||||
|
"system-prompt rebuild paths would diverge."
|
||||||
|
)
|
||||||
|
assert captured.get("session_id") == agent.session_id, (
|
||||||
|
"Review fork did not inherit parent's session_id — "
|
||||||
|
"system-prompt rebuild paths would diverge."
|
||||||
|
)
|
||||||
|
|
@ -1,8 +1,16 @@
|
||||||
"""Tests that the background review agent is restricted to memory+skills toolsets.
|
"""Tests that the background review agent restricts tools at runtime, not at schema time.
|
||||||
|
|
||||||
Regression coverage for issue #15204: the background skill-review agent
|
Regression coverage for issue #15204 (the background skill-review agent must
|
||||||
inherited the full default toolset, allowing it to perform non-skill side
|
not perform non-skill side effects like terminal, send_message, delegate_task)
|
||||||
effects (terminal, send_message, delegate_task, etc.).
|
combined with issue #25322 / PR #17276 (the review fork must hit the parent's
|
||||||
|
Anthropic/OpenRouter prefix cache).
|
||||||
|
|
||||||
|
Reconciling the two: the fork now inherits the parent's full ``tools`` schema
|
||||||
|
so the cache-key matches, and enforces the memory+skills restriction at
|
||||||
|
runtime via a thread-local whitelist on the existing
|
||||||
|
``get_pre_tool_call_block_message`` gate. Safety is preserved mechanically
|
||||||
|
(any non-whitelisted dispatch is blocked) without the schema-level narrowing
|
||||||
|
that caused the prefix-cache miss.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import threading
|
import threading
|
||||||
|
|
@ -24,6 +32,9 @@ def _make_agent_stub(agent_cls):
|
||||||
agent._skill_nudge_interval = 5
|
agent._skill_nudge_interval = 5
|
||||||
agent.background_review_callback = None
|
agent.background_review_callback = None
|
||||||
agent.status_callback = None
|
agent.status_callback = None
|
||||||
|
agent._cached_system_prompt = None
|
||||||
|
import datetime as _dt
|
||||||
|
agent.session_start = _dt.datetime(2026, 1, 1, 12, 0, 0)
|
||||||
agent._MEMORY_REVIEW_PROMPT = "review memory"
|
agent._MEMORY_REVIEW_PROMPT = "review memory"
|
||||||
agent._SKILL_REVIEW_PROMPT = "review skills"
|
agent._SKILL_REVIEW_PROMPT = "review skills"
|
||||||
agent._COMBINED_REVIEW_PROMPT = "review both"
|
agent._COMBINED_REVIEW_PROMPT = "review both"
|
||||||
|
|
@ -41,15 +52,20 @@ class _SyncThread:
|
||||||
self._target()
|
self._target()
|
||||||
|
|
||||||
|
|
||||||
def test_background_review_agent_uses_restricted_toolsets():
|
def test_background_review_does_not_narrow_toolset_schema():
|
||||||
"""The review agent must only have access to 'memory' and 'skills' toolsets."""
|
"""The review fork must NOT pass enabled_toolsets to AIAgent.
|
||||||
|
|
||||||
|
Narrowing the schema diverges the ``tools`` cache key from the parent's,
|
||||||
|
which sits above ``system`` in Anthropic's cache hierarchy and forces a
|
||||||
|
full prefix-cache miss on every review (see #25322, PR #17276).
|
||||||
|
"""
|
||||||
import run_agent
|
import run_agent
|
||||||
|
|
||||||
agent = _make_agent_stub(run_agent.AIAgent)
|
agent = _make_agent_stub(run_agent.AIAgent)
|
||||||
captured = {}
|
captured = {}
|
||||||
|
|
||||||
def _capture_init(self, *args, **kwargs):
|
def _capture_init(self, *args, **kwargs):
|
||||||
captured["enabled_toolsets"] = kwargs.get("enabled_toolsets")
|
captured["enabled_toolsets"] = kwargs.get("enabled_toolsets", "UNSET")
|
||||||
raise RuntimeError("stop after capturing init args")
|
raise RuntimeError("stop after capturing init args")
|
||||||
|
|
||||||
with patch.object(run_agent.AIAgent, "__init__", _capture_init), \
|
with patch.object(run_agent.AIAgent, "__init__", _capture_init), \
|
||||||
|
|
@ -61,11 +77,71 @@ def test_background_review_agent_uses_restricted_toolsets():
|
||||||
)
|
)
|
||||||
|
|
||||||
assert "enabled_toolsets" in captured, "AIAgent.__init__ was not called"
|
assert "enabled_toolsets" in captured, "AIAgent.__init__ was not called"
|
||||||
assert sorted(captured["enabled_toolsets"]) == ["memory", "skills"]
|
# The kwarg must be absent — letting AIAgent inherit the default full
|
||||||
|
# toolset so the schema bytes match the parent's.
|
||||||
|
assert captured["enabled_toolsets"] == "UNSET", (
|
||||||
|
f"Review fork narrowed the toolset schema (got {captured['enabled_toolsets']!r}), "
|
||||||
|
"which breaks prefix-cache parity with the parent."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_background_review_installs_thread_local_whitelist():
|
||||||
|
"""The review fork must install a memory/skills-only thread-local whitelist.
|
||||||
|
|
||||||
|
The schema-level toolset narrowing was lifted (for prefix-cache parity),
|
||||||
|
so #15204's safety contract now relies on the runtime whitelist gate to
|
||||||
|
deny terminal/send_message/delegate_task at dispatch time. Verify the
|
||||||
|
whitelist is set with exactly the memory+skills tool names.
|
||||||
|
"""
|
||||||
|
import run_agent
|
||||||
|
from hermes_cli import plugins as _plugins
|
||||||
|
|
||||||
|
captured = {}
|
||||||
|
|
||||||
|
def _capture_whitelist(whitelist, deny_msg_fmt=None):
|
||||||
|
captured["whitelist"] = set(whitelist)
|
||||||
|
captured["deny_msg_fmt"] = deny_msg_fmt
|
||||||
|
# Stop here — we just want to see what gets installed.
|
||||||
|
raise RuntimeError("stop after capturing whitelist")
|
||||||
|
|
||||||
|
agent = _make_agent_stub(run_agent.AIAgent)
|
||||||
|
|
||||||
|
def _no_init(self, *args, **kwargs):
|
||||||
|
# Don't crash AIAgent.__init__; let execution flow reach
|
||||||
|
# set_thread_tool_whitelist.
|
||||||
|
return None
|
||||||
|
|
||||||
|
with patch.object(run_agent.AIAgent, "__init__", _no_init), \
|
||||||
|
patch.object(_plugins, "set_thread_tool_whitelist", _capture_whitelist), \
|
||||||
|
patch("threading.Thread", _SyncThread):
|
||||||
|
agent._spawn_background_review(
|
||||||
|
messages_snapshot=[],
|
||||||
|
review_memory=True,
|
||||||
|
review_skills=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert "whitelist" in captured, "set_thread_tool_whitelist was not called"
|
||||||
|
whitelist = captured["whitelist"]
|
||||||
|
# memory + skills tools must be allowed
|
||||||
|
assert "memory" in whitelist
|
||||||
|
assert "skill_manage" in whitelist
|
||||||
|
assert "skill_view" in whitelist
|
||||||
|
assert "skills_list" in whitelist
|
||||||
|
# dangerous tools must NOT be in the whitelist
|
||||||
|
assert "terminal" not in whitelist
|
||||||
|
assert "send_message" not in whitelist
|
||||||
|
assert "delegate_task" not in whitelist
|
||||||
|
assert "web_search" not in whitelist
|
||||||
|
assert "execute_code" not in whitelist
|
||||||
|
|
||||||
|
|
||||||
def test_background_review_agent_tools_are_limited():
|
def test_background_review_agent_tools_are_limited():
|
||||||
"""Verify the resolved memory+skills toolsets only contain memory and skill tools."""
|
"""Verify the resolved memory+skills toolsets only contain memory and skill tools.
|
||||||
|
|
||||||
|
Sanity check on the source of truth for what the runtime whitelist is
|
||||||
|
derived from — if a future PR adds e.g. `terminal` to the `memory`
|
||||||
|
toolset, the review-fork safety contract silently breaks.
|
||||||
|
"""
|
||||||
from toolsets import resolve_multiple_toolsets
|
from toolsets import resolve_multiple_toolsets
|
||||||
|
|
||||||
expected_tools = set(resolve_multiple_toolsets(["memory", "skills"]))
|
expected_tools = set(resolve_multiple_toolsets(["memory", "skills"]))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue