fix(cache): kill long-lived prefix layout — system prompt is now byte-static within a session (#24778)

The long-lived prefix-cache layout split the system prompt into stable/
context/volatile blocks and re-derived them on every API call. The
volatile tier (timestamp + memory snapshot + USER profile) ticks per
turn, so the system message bytes mutated mid-conversation and broke
upstream prompt caches (OpenRouter, Nous Portal, Anthropic).

Diagnosed via live wire-format diffing: in an 8-turn conversation, the
OLD layout changed the sha of system block[1] mid-session at the minute
boundary, which dropped cached_tokens to 0 on that turn (cumulative
66.6% vs 83.3% for the single-block layout). Hermes invariant:
history (system + all but the last 1-2 messages) must be static.

Fix: drop the long-lived layout entirely. Single layout everywhere —
system_and_3 with one cached system string built once on first turn,
replayed verbatim on every subsequent turn. Loses cross-session 1h
prefix caching for Claude (the feature that motivated the split), but
within-session caching now actually works on every provider.

Removed:
- run_agent.py: _use_long_lived_prefix_cache flag, _long_lived_cache_ttl,
  _supports_long_lived_anthropic_cache method, the long-lived branch in
  run_conversation, mark_tools_for_long_lived_cache call site
- agent/prompt_caching.py: apply_anthropic_cache_control_long_lived,
  mark_tools_for_long_lived_cache, _mark_system_stable_block helper
- hermes_cli/config.py: prompt_caching.long_lived_prefix and
  prompt_caching.long_lived_ttl config keys
- tests/agent/test_prompt_caching_live.py (entire file)
- tests/agent/test_prompt_caching.py: TestMarkToolsForLongLivedCache,
  TestApplyAnthropicCacheControlLongLived
- tests/run_agent/test_anthropic_prompt_cache_policy.py:
  TestSupportsLongLivedAnthropicCache

Targeted tests: 62/62 pass.
This commit is contained in:
Teknium 2026-05-12 20:46:04 -07:00 committed by GitHub
parent 80374d4dd9
commit b06e999302
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 41 additions and 714 deletions

View file

@ -330,134 +330,3 @@ class TestExplicitOverrides:
# Long-lived prefix cache policy (cross-session 1h tier)
# ─────────────────────────────────────────────────────────────────────
class TestSupportsLongLivedAnthropicCache:
    """Pin which provider/model routes qualify for the long-lived layout.

    The check is narrower than ``_anthropic_prompt_cache_policy``: only
    Claude on the four explicitly validated endpoints is expected to be
    accepted; every other combination in this class must be rejected.
    """

    # ── Accepted: Claude on a validated endpoint ─────────────────────────

    def test_native_anthropic_claude_supported(self):
        candidate = _make_agent(
            provider="anthropic",
            base_url="https://api.anthropic.com",
            api_mode="anthropic_messages",
            model="claude-sonnet-4.6",
        )
        verdict = candidate._supports_long_lived_anthropic_cache()
        assert verdict is True

    def test_anthropic_oauth_supported(self):
        # OAuth shares the native Anthropic transport, so the agent is
        # configured the same way as the native case.
        candidate = _make_agent(
            provider="anthropic",
            base_url="https://api.anthropic.com",
            api_mode="anthropic_messages",
            model="claude-opus-4.6",
        )
        verdict = candidate._supports_long_lived_anthropic_cache()
        assert verdict is True

    def test_openrouter_claude_supported(self):
        candidate = _make_agent(
            provider="openrouter",
            base_url="https://openrouter.ai/api/v1",
            api_mode="chat_completions",
            model="anthropic/claude-sonnet-4.6",
        )
        verdict = candidate._supports_long_lived_anthropic_cache()
        assert verdict is True

    def test_nous_portal_claude_supported(self):
        # Nous Portal proxies to OpenRouter, so the wire format matches.
        candidate = _make_agent(
            provider="nous",
            base_url="https://inference-api.nousresearch.com/v1",
            api_mode="chat_completions",
            model="anthropic/claude-opus-4.7",
        )
        verdict = candidate._supports_long_lived_anthropic_cache()
        assert verdict is True

    # ── Rejected: non-Claude models or unvalidated endpoints ─────────────

    def test_nous_portal_qwen_NOT_long_lived(self):
        # Portal Qwen keeps receiving cache_control markers through the
        # standard system_and_3 5m layout (covered by the
        # _anthropic_prompt_cache_policy tests), but it must NOT use the
        # prefix_and_2 1h layout. Alibaba DashScope -- the upstream for
        # every Qwen route, including Portal -> OpenRouter -> Alibaba --
        # supports only a single ``ephemeral`` TTL of 5 minutes.
        # ttl="1h" markers are silently ignored there, so the high-value
        # tools[-1] + system-prefix breakpoints would never land.
        candidate = _make_agent(
            provider="nous",
            base_url="https://inference-api.nousresearch.com/v1",
            api_mode="chat_completions",
            model="qwen3.6-plus",
        )
        verdict = candidate._supports_long_lived_anthropic_cache()
        assert verdict is False

    def test_nous_portal_qwen_vendored_slug_NOT_long_lived(self):
        # Same Qwen route as above, addressed by its vendor-prefixed slug.
        candidate = _make_agent(
            provider="nous",
            base_url="https://inference-api.nousresearch.com/v1",
            api_mode="chat_completions",
            model="qwen/qwen3.6-plus",
        )
        verdict = candidate._supports_long_lived_anthropic_cache()
        assert verdict is False

    def test_nous_portal_non_claude_rejected(self):
        # The Portal long-lived scope is Claude-only. Qwen has its own
        # dedicated tests; this one stands in for everything else
        # (gpt, etc.).
        candidate = _make_agent(
            provider="nous",
            base_url="https://inference-api.nousresearch.com/v1",
            api_mode="chat_completions",
            model="openai/gpt-5.4",
        )
        verdict = candidate._supports_long_lived_anthropic_cache()
        assert verdict is False

    def test_openrouter_non_claude_rejected(self):
        candidate = _make_agent(
            provider="openrouter",
            base_url="https://openrouter.ai/api/v1",
            api_mode="chat_completions",
            model="openai/gpt-5.4",
        )
        verdict = candidate._supports_long_lived_anthropic_cache()
        assert verdict is False

    def test_third_party_anthropic_gateway_rejected(self):
        # MiniMax / Kimi / etc. speak the anthropic wire format but are
        # not on the validated list.
        candidate = _make_agent(
            provider="minimax",
            base_url="https://api.minimax.io/anthropic",
            api_mode="anthropic_messages",
            model="minimax-m2.7",
        )
        verdict = candidate._supports_long_lived_anthropic_cache()
        assert verdict is False

    def test_alibaba_dashscope_rejected(self):
        candidate = _make_agent(
            provider="alibaba",
            base_url="https://dashscope.aliyuncs.com/api/v1/anthropic",
            api_mode="anthropic_messages",
            model="qwen3.5-plus",
        )
        verdict = candidate._supports_long_lived_anthropic_cache()
        assert verdict is False

    def test_opencode_qwen_rejected(self):
        candidate = _make_agent(
            provider="opencode-go",
            base_url="https://api.opencode-go.example/v1",
            api_mode="chat_completions",
            model="qwen3.6-plus",
        )
        verdict = candidate._supports_long_lived_anthropic_cache()
        assert verdict is False

    # ── Explicit overrides ───────────────────────────────────────────────

    def test_fallback_target_evaluated_independently(self):
        # The agent starts on a non-supported provider; the fallback
        # target (OpenRouter Claude) is passed to the check explicitly
        # and is expected to be accepted.
        candidate = _make_agent(
            provider="minimax",
            base_url="https://api.minimax.io/anthropic",
            api_mode="anthropic_messages",
            model="minimax-m2.7",
        )
        verdict = candidate._supports_long_lived_anthropic_cache(
            provider="openrouter",
            base_url="https://openrouter.ai/api/v1",
            api_mode="chat_completions",
            model="anthropic/claude-sonnet-4.6",
        )
        assert verdict is True