mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-24 05:41:40 +00:00
fix(cache): kill long-lived prefix layout — system prompt is now byte-static within a session (#24778)
The long-lived prefix-cache layout split the system prompt into stable/context/volatile blocks and re-derived them on every API call. The volatile tier (timestamp + memory snapshot + USER profile) ticks per turn, so the system message bytes mutated mid-conversation and broke upstream prompt caches (OpenRouter, Nous Portal, Anthropic).

Diagnosed via live wire-format diffing: an 8-turn conversation showed the old layout flipping the sha of system block[1] mid-session at the minute boundary, dropping cached_tokens to 0 on that turn (cumulative 66.6% vs 83.3% for the single-block layout).

Hermes invariant: history (system + all but the last 1-2 messages) must be static.

Fix: drop the long-lived layout entirely. There is now a single layout everywhere — system_and_3, with one cached system string built once on the first turn and replayed verbatim on every subsequent turn. This loses cross-session 1h prefix caching for Claude (the feature that motivated the split), but within-session caching now actually works on every provider.

Removed:
- run_agent.py: _use_long_lived_prefix_cache flag, _long_lived_cache_ttl, _supports_long_lived_anthropic_cache method, the long-lived branch in run_conversation, mark_tools_for_long_lived_cache call site
- agent/prompt_caching.py: apply_anthropic_cache_control_long_lived, mark_tools_for_long_lived_cache, _mark_system_stable_block helper
- hermes_cli/config.py: prompt_caching.long_lived_prefix and prompt_caching.long_lived_ttl config keys
- tests/agent/test_prompt_caching_live.py (entire file)
- tests/agent/test_prompt_caching.py: TestMarkToolsForLongLivedCache, TestApplyAnthropicCacheControlLongLived
- tests/run_agent/test_anthropic_prompt_cache_policy.py: TestSupportsLongLivedAnthropicCache

Targeted tests: 62/62 pass.
This commit is contained in:
parent
80374d4dd9
commit
b06e999302
8 changed files with 41 additions and 714 deletions
|
|
@ -330,134 +330,3 @@ class TestExplicitOverrides:
|
|||
# Long-lived prefix cache policy (cross-session 1h tier)
|
||||
# ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestSupportsLongLivedAnthropicCache:
    """Eligibility checks for the long-lived cache layout.

    The check is narrower than _anthropic_prompt_cache_policy: only Claude
    models on the four explicitly-validated endpoints are expected to get
    the long-lived layout.
    """

    def test_native_anthropic_claude_supported(self):
        supported = _make_agent(
            provider="anthropic",
            base_url="https://api.anthropic.com",
            api_mode="anthropic_messages",
            model="claude-sonnet-4.6",
        )._supports_long_lived_anthropic_cache()
        assert supported is True

    def test_anthropic_oauth_supported(self):
        # OAuth rides the same transport as native Anthropic.
        supported = _make_agent(
            provider="anthropic",
            base_url="https://api.anthropic.com",
            api_mode="anthropic_messages",
            model="claude-opus-4.6",
        )._supports_long_lived_anthropic_cache()
        assert supported is True

    def test_openrouter_claude_supported(self):
        supported = _make_agent(
            provider="openrouter",
            base_url="https://openrouter.ai/api/v1",
            api_mode="chat_completions",
            model="anthropic/claude-sonnet-4.6",
        )._supports_long_lived_anthropic_cache()
        assert supported is True

    def test_nous_portal_claude_supported(self):
        # Nous Portal proxies to OpenRouter, so the wire format is the same.
        supported = _make_agent(
            provider="nous",
            base_url="https://inference-api.nousresearch.com/v1",
            api_mode="chat_completions",
            model="anthropic/claude-opus-4.7",
        )._supports_long_lived_anthropic_cache()
        assert supported is True

    def test_nous_portal_qwen_NOT_long_lived(self):
        # Portal Qwen keeps its cache_control markers through the standard
        # system_and_3 5m layout (covered by the
        # _anthropic_prompt_cache_policy tests), but must NOT use the
        # prefix_and_2 1h layout. Alibaba DashScope -- the upstream for every
        # Qwen route, including Portal -> OpenRouter -> Alibaba -- supports
        # only a single ``ephemeral`` TTL of 5 minutes. ttl="1h" markers are
        # silently ignored, so the high-value tools[-1] + system-prefix
        # breakpoints would not land. Stay on system_and_3 instead.
        supported = _make_agent(
            provider="nous",
            base_url="https://inference-api.nousresearch.com/v1",
            api_mode="chat_completions",
            model="qwen3.6-plus",
        )._supports_long_lived_anthropic_cache()
        assert supported is False

    def test_nous_portal_qwen_vendored_slug_NOT_long_lived(self):
        supported = _make_agent(
            provider="nous",
            base_url="https://inference-api.nousresearch.com/v1",
            api_mode="chat_completions",
            model="qwen/qwen3.6-plus",
        )._supports_long_lived_anthropic_cache()
        assert supported is False

    def test_nous_portal_non_claude_rejected(self):
        # Portal long-lived cache scope is Claude-only. The dedicated Qwen
        # tests handle Qwen; this one covers everything else (gpt, etc.).
        supported = _make_agent(
            provider="nous",
            base_url="https://inference-api.nousresearch.com/v1",
            api_mode="chat_completions",
            model="openai/gpt-5.4",
        )._supports_long_lived_anthropic_cache()
        assert supported is False

    def test_openrouter_non_claude_rejected(self):
        supported = _make_agent(
            provider="openrouter",
            base_url="https://openrouter.ai/api/v1",
            api_mode="chat_completions",
            model="openai/gpt-5.4",
        )._supports_long_lived_anthropic_cache()
        assert supported is False

    def test_third_party_anthropic_gateway_rejected(self):
        # MiniMax / Kimi / etc. speak the anthropic wire format but are not
        # on the validated list.
        supported = _make_agent(
            provider="minimax",
            base_url="https://api.minimax.io/anthropic",
            api_mode="anthropic_messages",
            model="minimax-m2.7",
        )._supports_long_lived_anthropic_cache()
        assert supported is False

    def test_alibaba_dashscope_rejected(self):
        supported = _make_agent(
            provider="alibaba",
            base_url="https://dashscope.aliyuncs.com/api/v1/anthropic",
            api_mode="anthropic_messages",
            model="qwen3.5-plus",
        )._supports_long_lived_anthropic_cache()
        assert supported is False

    def test_opencode_qwen_rejected(self):
        supported = _make_agent(
            provider="opencode-go",
            base_url="https://api.opencode-go.example/v1",
            api_mode="chat_completions",
            model="qwen3.6-plus",
        )._supports_long_lived_anthropic_cache()
        assert supported is False

    def test_fallback_target_evaluated_independently(self):
        # The agent starts on an unsupported provider; the check is then made
        # against an OpenRouter Claude fallback target passed explicitly.
        primary = _make_agent(
            provider="minimax",
            base_url="https://api.minimax.io/anthropic",
            api_mode="anthropic_messages",
            model="minimax-m2.7",
        )
        fallback_target = {
            "provider": "openrouter",
            "base_url": "https://openrouter.ai/api/v1",
            "api_mode": "chat_completions",
            "model": "anthropic/claude-sonnet-4.6",
        }
        supported = primary._supports_long_lived_anthropic_cache(**fallback_target)
        assert supported is True
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue