From 5fe0672260e65a6ff664f5905eb69a6fca674707 Mon Sep 17 00:00:00 2001 From: WorldWriter <30366221+WorldWriter@users.noreply.github.com> Date: Wed, 29 Apr 2026 13:05:23 +0800 Subject: [PATCH] fix(memory): hit prefix cache in background review fork MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background review fork is supposed to hit Anthropic's prefix cache on the parent's messages_snapshot, but currently doesn't (cache_read=0 on every fork). Two root causes, fixed in this commit: 1. System prompt is rebuilt at fork time. _cached_system_prompt starts as None, so run_conversation calls _build_system_prompt, which embeds a minute-precision "Conversation started: ..." timestamp. Reviews fire 10+ turns after session start, so the minute differs from main's, producing a 1-character diff that invalidates the byte-exact cache key. Fix: inherit the parent's _cached_system_prompt directly (same idea as #17089, which was self-closed for only fixing this half). 2. Tools schema was narrowed via enabled_toolsets=["memory","skills"] for safety. Anthropic's cache key includes `tools`, which sits before `system` in the cache hierarchy, so even byte-identical `system` won't hit when `tools` differs from main's full set. Fix: drop the schema-level restriction so `tools` matches main, and deny non-whitelisted tools at runtime via the existing get_pre_tool_call_block_message gate (hermes_cli/plugins.py:1085, already called at all three dispatch sites). Install/clear a thread- local whitelist (added in the previous commit) on the daemon thread. Append a soft constraint to the review prompt so the model knows. Real E2E on Sonnet 4.5 (12-tool task + auto-triggered review): - Per review-call cost: $0.331 → $0.035 (~89% reduction) - End-to-end per run: $0.848 → $0.629 (~26% reduction) - Review fork cache_create / cache_read: 88,385 / 0 → 1,234 / 94,404 Co-Authored-By: Claude Opus 4.7 (1M context) --- run_agent.py | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/run_agent.py b/run_agent.py index f9eaee85af6..ecaceaa78d7 100644 --- a/run_agent.py +++ b/run_agent.py @@ -4289,7 +4289,6 @@ class AIAgent: api_key=_parent_runtime.get("api_key") or None, credential_pool=getattr(self, "_credential_pool", None), parent_session_id=self.session_id, - enabled_toolsets=["memory", "skills"], ) review_agent._memory_write_origin = "background_review" review_agent._memory_write_context = "background_review" @@ -4306,12 +4305,51 @@ class AIAgent: # _vprint and leak past the stdout redirect (they go via # _print_fn/status_callback, which bypass sys.stdout). review_agent.suppress_status_output = True + # Inherit the parent's cached system prompt verbatim so + # the review fork's outbound HTTP request hits the same + # Anthropic/OpenRouter prefix cache the parent warmed. + # Without this, the fork rebuilds the system prompt from + # scratch (fresh _hermes_now() timestamp, fresh + # session_id, narrower toolset → different skills_prompt) + # and the byte-exact prefix-cache key misses. See + # issue #25322 and PR #17276 for the full analysis + + # measured impact (~26% end-to-end cost reduction on + # Sonnet 4.5). + review_agent._cached_system_prompt = self._cached_system_prompt - review_agent.run_conversation( - user_message=prompt, - conversation_history=messages_snapshot, + from model_tools import get_tool_definitions + from hermes_cli.plugins import ( + set_thread_tool_whitelist, + clear_thread_tool_whitelist, ) + review_whitelist = { + t["function"]["name"] + for t in get_tool_definitions( + enabled_toolsets=["memory", "skills"], + quiet_mode=True, + ) + } + set_thread_tool_whitelist( + review_whitelist, + deny_msg_fmt=( + "Background review denied non-whitelisted tool: " + "{tool_name}. Only memory/skill tools are allowed." + ), + ) + try: + review_agent.run_conversation( + user_message=( + prompt + + "\n\nYou can only call memory and skill " + "management tools. Other tools will be denied " + "at runtime — do not attempt them." + ), + conversation_history=messages_snapshot, + ) + finally: + clear_thread_tool_whitelist() + # Scan the review agent's messages for successful tool actions # and surface a compact summary to the user. Tool messages # already present in messages_snapshot must be skipped, since