fix(cache): kill long-lived prefix layout — system prompt is now byte-static within a session (#24778)

The long-lived prefix-cache layout split the system prompt into stable/context/volatile blocks and re-derived them on every API call. The volatile tier (timestamp + memory snapshot + USER profile) ticks per turn, so the system message bytes mutated mid-conversation and broke upstream prompt caches (OpenRouter, Nous Portal, Anthropic).

Diagnosed via live wire-format diffing: an 8-turn conversation showed the OLD layout flipping the system block[1] SHA mid-session at the minute boundary, dropping cached_tokens to 0 on that turn (cumulative 66.6% vs 83.3% for the single-block layout).

Hermes invariant: history (system + all but the last 1-2 messages) must be static.

Fix: drop the long-lived layout entirely. Single layout everywhere — system_and_3 with one cached system string built once on the first turn and replayed verbatim on every subsequent turn. This loses cross-session 1h prefix caching for Claude (the feature that motivated the split), but within-session caching now actually works on every provider.

Removed:
- run_agent.py: _use_long_lived_prefix_cache flag, _long_lived_cache_ttl, _supports_long_lived_anthropic_cache method, the long-lived branch in run_conversation, mark_tools_for_long_lived_cache call site
- agent/prompt_caching.py: apply_anthropic_cache_control_long_lived, mark_tools_for_long_lived_cache, _mark_system_stable_block helper
- hermes_cli/config.py: prompt_caching.long_lived_prefix and prompt_caching.long_lived_ttl config keys
- tests/agent/test_prompt_caching_live.py (entire file)
- tests/agent/test_prompt_caching.py: TestMarkToolsForLongLivedCache, TestApplyAnthropicCacheControlLongLived
- tests/run_agent/test_anthropic_prompt_cache_policy.py: TestSupportsLongLivedAnthropicCache

Targeted tests: 62/62 pass.
2026-05-24 05:41:40 +00:00 · 2026-05-12 20:46:04 -07:00 · 2026-05-12 20:46:04 -07:00 · b06e999302
commit b06e999302
parent 80374d4dd9
8 changed files with 41 additions and 714 deletions
--- a/tests/run_agent/test_anthropic_prompt_cache_policy.py
+++ b/tests/run_agent/test_anthropic_prompt_cache_policy.py
@@ -330,134 +330,3 @@ class TestExplicitOverrides:
 # Long-lived prefix cache policy (cross-session 1h tier)
 # ─────────────────────────────────────────────────────────────────────

-class TestSupportsLongLivedAnthropicCache:
-    """Narrower than _anthropic_prompt_cache_policy — only Claude on the 4
-    explicitly-validated endpoints get the long-lived layout."""
-
-    def test_native_anthropic_claude_supported(self):
-        agent = _make_agent(
-            provider="anthropic",
-            base_url="https://api.anthropic.com",
-            api_mode="anthropic_messages",
-            model="claude-sonnet-4.6",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_anthropic_oauth_supported(self):
-        # OAuth uses the same transport as native Anthropic
-        agent = _make_agent(
-            provider="anthropic",
-            base_url="https://api.anthropic.com",
-            api_mode="anthropic_messages",
-            model="claude-opus-4.6",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_openrouter_claude_supported(self):
-        agent = _make_agent(
-            provider="openrouter",
-            base_url="https://openrouter.ai/api/v1",
-            api_mode="chat_completions",
-            model="anthropic/claude-sonnet-4.6",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_nous_portal_claude_supported(self):
-        # Nous Portal proxies to OpenRouter — same wire format
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="anthropic/claude-opus-4.7",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_nous_portal_qwen_NOT_long_lived(self):
-        # Portal Qwen still gets cache_control markers via the standard
-        # system_and_3 5m layout (see _anthropic_prompt_cache_policy
-        # tests above), but it must NOT ride the prefix_and_2 1h layout.
-        # Alibaba DashScope (the upstream for every Qwen route, incl.
-        # Portal -> OpenRouter -> Alibaba) only supports a single
-        # ``ephemeral`` TTL of 5 minutes; ttl="1h" markers are silently
-        # ignored, so the high-value tools[-1] + system-prefix
-        # breakpoints don't land. Stay on system_and_3 instead.
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="qwen3.6-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_nous_portal_qwen_vendored_slug_NOT_long_lived(self):
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="qwen/qwen3.6-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_nous_portal_non_claude_rejected(self):
-        # Portal long-lived cache scope is now Claude-only. Qwen
-        # rejection is covered by the dedicated tests above; this
-        # covers everything else (gpt, etc.).
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="openai/gpt-5.4",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_openrouter_non_claude_rejected(self):
-        agent = _make_agent(
-            provider="openrouter",
-            base_url="https://openrouter.ai/api/v1",
-            api_mode="chat_completions",
-            model="openai/gpt-5.4",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_third_party_anthropic_gateway_rejected(self):
-        # MiniMax / Kimi / etc. — anthropic-wire but not in our validated list
-        agent = _make_agent(
-            provider="minimax",
-            base_url="https://api.minimax.io/anthropic",
-            api_mode="anthropic_messages",
-            model="minimax-m2.7",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_alibaba_dashscope_rejected(self):
-        agent = _make_agent(
-            provider="alibaba",
-            base_url="https://dashscope.aliyuncs.com/api/v1/anthropic",
-            api_mode="anthropic_messages",
-            model="qwen3.5-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_opencode_qwen_rejected(self):
-        agent = _make_agent(
-            provider="opencode-go",
-            base_url="https://api.opencode-go.example/v1",
-            api_mode="chat_completions",
-            model="qwen3.6-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_fallback_target_evaluated_independently(self):
-        # Starting on a non-supported provider, falling back to OpenRouter Claude
-        agent = _make_agent(
-            provider="minimax",
-            base_url="https://api.minimax.io/anthropic",
-            api_mode="anthropic_messages",
-            model="minimax-m2.7",
-        )
-        assert agent._supports_long_lived_anthropic_cache(
-            provider="openrouter",
-            base_url="https://openrouter.ai/api/v1",
-            api_mode="chat_completions",
-            model="anthropic/claude-sonnet-4.6",
-        ) is True