fix(cache): kill long-lived prefix layout — system prompt is now byte-static within a session (#24778)

The long-lived prefix-cache layout split the system prompt into stable/context/volatile blocks and re-derived them on every API call. The volatile tier (timestamp + memory snapshot + USER profile) ticks per turn, so the system message bytes mutated mid-conversation and broke upstream prompt caches (OpenRouter, Nous Portal, Anthropic).

Diagnosed via live wire-format diffing: an 8-turn conversation showed the OLD layout flipping the system block[1] sha mid-session at the minute boundary, dropping cached_tokens to 0 on that turn (cumulative 66.6% vs 83.3% for the single-block layout).

Hermes invariant: history (system + all but the last 1-2 messages) must be static.

Fix: drop the long-lived layout entirely. Single layout everywhere — system_and_3 with one cached system string built once on the first turn and replayed verbatim on every subsequent turn. This loses cross-session 1h prefix caching for Claude (the feature that motivated the split), but within-session caching now actually works on every provider.

Removed:
- run_agent.py: _use_long_lived_prefix_cache flag, _long_lived_cache_ttl, _supports_long_lived_anthropic_cache method, the long-lived branch in run_conversation, mark_tools_for_long_lived_cache call site
- agent/prompt_caching.py: apply_anthropic_cache_control_long_lived, mark_tools_for_long_lived_cache, _mark_system_stable_block helper
- hermes_cli/config.py: prompt_caching.long_lived_prefix and prompt_caching.long_lived_ttl config keys
- tests/agent/test_prompt_caching_live.py (entire file)
- tests/agent/test_prompt_caching.py: TestMarkToolsForLongLivedCache, TestApplyAnthropicCacheControlLongLived
- tests/run_agent/test_anthropic_prompt_cache_policy.py: TestSupportsLongLivedAnthropicCache

Targeted tests: 62/62 pass.
2026-05-24 05:41:40 +00:00 · 2026-05-12 20:46:04 -07:00 · 2026-05-12 20:46:04 -07:00 · b06e999302
commit b06e999302
parent 80374d4dd9
8 changed files with 41 additions and 714 deletions
--- a/tests/agent/test_prompt_caching.py
+++ b/tests/agent/test_prompt_caching.py
@@ -6,8 +6,6 @@ import pytest
 from agent.prompt_caching import (
    _apply_cache_marker,
    apply_anthropic_cache_control,
-    apply_anthropic_cache_control_long_lived,
-    mark_tools_for_long_lived_cache,
 )


@@ -143,132 +141,3 @@ class TestApplyAnthropicCacheControl:
            elif "cache_control" in msg:
                count += 1
        assert count <= 4
-
-
-class TestMarkToolsForLongLivedCache:
-    def test_returns_unchanged_for_empty_tools(self):
-        assert mark_tools_for_long_lived_cache(None) is None
-        assert mark_tools_for_long_lived_cache([]) == []
-
-    def test_marks_only_last_tool(self):
-        tools = [
-            {"type": "function", "function": {"name": "a"}},
-            {"type": "function", "function": {"name": "b"}},
-            {"type": "function", "function": {"name": "c"}},
-        ]
-        out = mark_tools_for_long_lived_cache(tools)
-        assert "cache_control" not in out[0]
-        assert "cache_control" not in out[1]
-        assert out[2]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}
-
-    def test_does_not_mutate_input(self):
-        tools = [{"type": "function", "function": {"name": "a"}}]
-        mark_tools_for_long_lived_cache(tools)
-        assert "cache_control" not in tools[0]
-
-    def test_5m_ttl_drops_ttl_field(self):
-        tools = [{"type": "function", "function": {"name": "a"}}]
-        out = mark_tools_for_long_lived_cache(tools, long_lived_ttl="5m")
-        assert out[0]["cache_control"] == {"type": "ephemeral"}
-
-
-class TestApplyAnthropicCacheControlLongLived:
-    def test_empty_messages(self):
-        assert apply_anthropic_cache_control_long_lived([]) == []
-
-    def test_marks_first_block_of_split_system(self):
-        msgs = [
-            {"role": "system", "content": [
-                {"type": "text", "text": "STABLE"},
-                {"type": "text", "text": "CONTEXT"},
-                {"type": "text", "text": "VOLATILE"},
-            ]},
-            {"role": "user", "content": "msg1"},
-            {"role": "assistant", "content": "msg2"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-        sys_blocks = out[0]["content"]
-        assert sys_blocks[0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}
-        assert "cache_control" not in sys_blocks[1]
-        assert "cache_control" not in sys_blocks[2]
-
-    def test_rolling_marker_on_last_2_messages(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}]},
-            {"role": "user", "content": "u1"},
-            {"role": "assistant", "content": "a1"},
-            {"role": "user", "content": "u2"},
-            {"role": "assistant", "content": "a2"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-
-        def has_marker(m):
-            c = m.get("content")
-            if isinstance(c, list) and c and isinstance(c[-1], dict):
-                return "cache_control" in c[-1]
-            return "cache_control" in m
-
-        # u1 and a1 (older messages) should NOT be marked
-        assert not has_marker(out[1])
-        assert not has_marker(out[2])
-        # u2 and a2 (last 2) SHOULD be marked
-        assert has_marker(out[3])
-        assert has_marker(out[4])
-
-    def test_rolling_marker_uses_5m_ttl(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}]},
-            {"role": "user", "content": "u1"},
-            {"role": "assistant", "content": "a1"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(
-            msgs, long_lived_ttl="1h", rolling_ttl="5m",
-        )
-        # Last user message: cache_control on the wrapped text part should be 5m
-        last = out[-1]
-        c = last["content"]
-        assert isinstance(c, list)
-        assert c[-1]["cache_control"] == {"type": "ephemeral"}  # 5m has no ttl key
-
-    def test_string_system_falls_back_to_envelope_marker(self):
-        """When the caller didn't split the system message, we still place a marker."""
-        msgs = [
-            {"role": "system", "content": "Single string system"},
-            {"role": "user", "content": "u1"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-        sys_content = out[0]["content"]
-        # Wrapped into a list and the (now sole) block gets the 1h marker
-        assert isinstance(sys_content, list)
-        assert sys_content[0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}
-
-    def test_does_not_mutate_input(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}]},
-            {"role": "user", "content": "u1"},
-        ]
-        before = copy.deepcopy(msgs)
-        apply_anthropic_cache_control_long_lived(msgs)
-        assert msgs == before
-
-    def test_max_4_breakpoints_with_split_system(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}, {"type": "text", "text": "V"}]},
-        ] + [
-            {"role": "user" if i % 2 == 0 else "assistant", "content": f"msg{i}"}
-            for i in range(10)
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-        count = 0
-        for m in out:
-            c = m.get("content")
-            if isinstance(c, list):
-                for item in c:
-                    if isinstance(item, dict) and "cache_control" in item:
-                        count += 1
-            elif "cache_control" in m:
-                count += 1
-        # 1 system block + last 2 messages = 3 breakpoints from this function.
-        # tools[-1] is marked separately (not via this function), so a 4th
-        # breakpoint can be added at API-call time.
-        assert count == 3
--- a/tests/agent/test_prompt_caching_live.py
+++ b/tests/agent/test_prompt_caching_live.py
@@ -1,112 +0,0 @@
-"""Live E2E: long-lived prefix caching on Claude via OpenRouter.
-
-Run only when LIVE_OR_KEY env var is set. Skipped under the normal hermetic
-test suite (which unsets credentials).
-"""
-import os, sys, tempfile, time, shutil, pytest
-
-
-# Probe for the key BEFORE conftest unsets it
-_LIVE_KEY = os.environ.get("OPENROUTER_API_KEY") or os.environ.get("LIVE_OR_KEY")
-if not _LIVE_KEY:
-    # Try to read directly from .env
-    env_path = os.path.expanduser("~/.hermes/.env")
-    if os.path.exists(env_path):
-        with open(env_path) as f:
-            for line in f:
-                if line.startswith("OPENROUTER_API_KEY="):
-                    _LIVE_KEY = line.strip().split("=", 1)[1].strip().strip('"').strip("'")
-                    break
-
-
-pytestmark = pytest.mark.skipif(
-    not _LIVE_KEY,
-    reason="set OPENROUTER_API_KEY (or LIVE_OR_KEY) to run live cache test",
-)
-
-
-def test_long_lived_prefix_cache_e2e_openrouter(tmp_path, monkeypatch):
-    """Two AIAgent runs in fresh sessions: call 1 writes cache, call 2 reads it."""
-    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
-    # The hermetic conftest unsets OPENROUTER_API_KEY — restore for this test
-    monkeypatch.setenv("OPENROUTER_API_KEY", _LIVE_KEY)
-
-    # Minimal config — but with enough toolset/guidance to exceed Anthropic's
-    # ~1024-token minimum-cacheable-prefix threshold. Anthropic silently
-    # ignores cache_control markers on small blocks.
-    import yaml
-    cfg_path = tmp_path / "config.yaml"
-    cfg_path.write_text(yaml.safe_dump({
-        "model": {"provider": "openrouter", "default": "anthropic/claude-haiku-4.5"},
-        "prompt_caching": {"long_lived_prefix": True, "long_lived_ttl": "1h", "cache_ttl": "5m"},
-        "agent": {"tool_use_enforcement": True},   # adds substantial guidance text
-        "memory": {"provider": ""},
-        "compression": {"enabled": False},
-    }))
-
-    from run_agent import AIAgent
-
-    def make_agent():
-        return AIAgent(
-            api_key=_LIVE_KEY,
-            base_url="https://openrouter.ai/api/v1",
-            provider="openrouter",
-            model="anthropic/claude-haiku-4.5",
-            api_mode="chat_completions",
-            # Use the default toolset roster — the tools array (~13k tokens
-            # for ~35 tools) is what carries the bulk of the cross-session
-            # cache value. With a tiny toolset the cached prefix can fall
-            # below Anthropic Haiku's 2048-token minimum cacheable size and
-            # the marker is silently ignored.
-            enabled_toolsets=None,
-            quiet_mode=True,
-            skip_context_files=True,
-            skip_memory=True,
-            save_trajectories=False,
-        )
-
-    a1 = make_agent()
-    assert a1._use_prompt_caching is True, "policy should enable caching for Claude on OR"
-    assert a1._use_long_lived_prefix_cache is True, "long-lived path should activate"
-    parts = a1._build_system_prompt_parts()
-    print(f"\nstable={len(parts['stable']):,} ctx={len(parts['context']):,} volatile={len(parts['volatile']):,} chars")
-    print(f"tool count: {len(a1.tools or [])}")
-
-    # Use distinct user messages each call so OpenRouter's response cache
-    # doesn't short-circuit the upstream Anthropic call (we need real
-    # Anthropic billing visibility to verify cache_creation/cache_read).
-    USER_1 = "Reply with the single word ALPHA."
-    USER_2 = "Reply with the single word BRAVO."
-
-    print("\n--- Call 1 (cold) ---")
-    r1 = a1.run_conversation(USER_1, conversation_history=[])
-    print(f"final_response[:80]: {(r1.get('final_response') or '')[:80]!r}")
-    cr1 = a1.session_cache_read_tokens
-    cw1 = a1.session_cache_write_tokens
-    print(f"call1: cache_read={cr1} cache_write={cw1}")
-
-    # Wait so cache settles, then fresh agent (NEW SESSION) for cross-session read
-    time.sleep(2)
-    a2 = make_agent()
-    assert a2.session_id != a1.session_id, "second agent must have a new session"
-
-    print("\n--- Call 2 (warm, NEW session, different user msg) ---")
-    r2 = a2.run_conversation(USER_2, conversation_history=[])
-    print(f"final_response[:80]: {(r2.get('final_response') or '')[:80]!r}")
-    cr2 = a2.session_cache_read_tokens
-    cw2 = a2.session_cache_write_tokens
-    print(f"call2: cache_read={cr2} cache_write={cw2}")
-
-    print(f"\n=== VERDICT ===")
-    print(f"  call1 wrote {cw1:,} cache tokens, read {cr1:,}")
-    print(f"  call2 wrote {cw2:,} cache tokens, read {cr2:,}")
-    if cw1:
-        print(f"  cross-session read fraction: cr2/cw1 = {cr2/cw1:.2%}")
-
-    # Assertions
-    assert cw1 > 0, f"call 1 must write cache (got {cw1}); long-lived layout not reaching wire"
-    assert cr2 > 0, (
-        f"call 2 must read cache cross-session (got {cr2}); "
-        f"stable prefix is not byte-stable across sessions"
-    )
-    assert cr2 >= 1000, f"cache_read on call 2 ({cr2}) too small to indicate real reuse"