mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-26 06:01:49 +00:00
fix(cache): kill long-lived prefix layout — system prompt is now byte-static within a session (#24778)
The long-lived prefix-cache layout split the system prompt into stable/context/volatile blocks and re-derived them on every API call. The volatile tier (timestamp + memory snapshot + USER profile) ticks per turn, so the system message bytes mutated mid-conversation and broke upstream prompt caches (OpenRouter, Nous Portal, Anthropic).

Diagnosed via live wire-format diffing: an 8-turn conversation showed the OLD layout flipping the sha of system block[1] mid-session at the minute boundary, dropping cached_tokens to 0 on that turn (cumulative 66.6% vs 83.3% for the single-block layout).

Hermes invariant: history (system + all but the last 1-2 messages) must be static.

Fix: drop the long-lived layout entirely. Single layout everywhere — system_and_3 with one cached system string built once on the first turn and replayed verbatim on every subsequent turn. This loses cross-session 1h prefix caching for Claude (the feature that motivated the split), but within-session caching now actually works on every provider.

Removed:
- run_agent.py: _use_long_lived_prefix_cache flag, _long_lived_cache_ttl, _supports_long_lived_anthropic_cache method, the long-lived branch in run_conversation, mark_tools_for_long_lived_cache call site
- agent/prompt_caching.py: apply_anthropic_cache_control_long_lived, mark_tools_for_long_lived_cache, _mark_system_stable_block helper
- hermes_cli/config.py: prompt_caching.long_lived_prefix and prompt_caching.long_lived_ttl config keys
- tests/agent/test_prompt_caching_live.py (entire file)
- tests/agent/test_prompt_caching.py: TestMarkToolsForLongLivedCache, TestApplyAnthropicCacheControlLongLived
- tests/run_agent/test_anthropic_prompt_cache_policy.py: TestSupportsLongLivedAnthropicCache

Targeted tests: 62/62 pass.
This commit is contained in:
parent
80374d4dd9
commit
b06e999302
8 changed files with 41 additions and 714 deletions
|
|
@ -6,8 +6,6 @@ import pytest
|
|||
from agent.prompt_caching import (
|
||||
_apply_cache_marker,
|
||||
apply_anthropic_cache_control,
|
||||
apply_anthropic_cache_control_long_lived,
|
||||
mark_tools_for_long_lived_cache,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -143,132 +141,3 @@ class TestApplyAnthropicCacheControl:
|
|||
elif "cache_control" in msg:
|
||||
count += 1
|
||||
assert count <= 4
|
||||
|
||||
|
||||
class TestMarkToolsForLongLivedCache:
    """Expectations for ``mark_tools_for_long_lived_cache``."""

    def test_returns_unchanged_for_empty_tools(self):
        """``None`` comes back as ``None`` and an empty list as an empty list."""
        assert mark_tools_for_long_lived_cache(None) is None
        assert mark_tools_for_long_lived_cache([]) == []

    def test_marks_only_last_tool(self):
        """Only the final tool entry is expected to carry ``cache_control``."""
        tool_defs = [
            {"type": "function", "function": {"name": tool_name}}
            for tool_name in ("a", "b", "c")
        ]
        marked = mark_tools_for_long_lived_cache(tool_defs)
        # The two leading entries must stay free of any marker.
        for untouched in (marked[0], marked[1]):
            assert "cache_control" not in untouched
        # A call without an explicit ttl is expected to yield a 1h marker.
        assert marked[2]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}

    def test_does_not_mutate_input(self):
        """The list passed in must not gain a marker as a side effect."""
        single_tool = {"type": "function", "function": {"name": "a"}}
        tool_defs = [single_tool]
        mark_tools_for_long_lived_cache(tool_defs)
        assert "cache_control" not in tool_defs[0]

    def test_5m_ttl_drops_ttl_field(self):
        """With ``long_lived_ttl="5m"`` the marker is expected to omit ``ttl``."""
        tool_defs = [{"type": "function", "function": {"name": "a"}}]
        marked = mark_tools_for_long_lived_cache(tool_defs, long_lived_ttl="5m")
        expected_marker = {"type": "ephemeral"}
        assert marked[0]["cache_control"] == expected_marker
|
||||
|
||||
|
||||
class TestApplyAnthropicCacheControlLongLived:
    """Expectations for ``apply_anthropic_cache_control_long_lived``."""

    def test_empty_messages(self):
        """An empty message list is expected to come back as an empty list."""
        assert apply_anthropic_cache_control_long_lived([]) == []

    def test_marks_first_block_of_split_system(self):
        """Only the first block of a multi-block system message is marked."""
        system_blocks = [
            {"type": "text", "text": label}
            for label in ("STABLE", "CONTEXT", "VOLATILE")
        ]
        conversation = [
            {"role": "system", "content": system_blocks},
            {"role": "user", "content": "msg1"},
            {"role": "assistant", "content": "msg2"},
        ]
        result = apply_anthropic_cache_control_long_lived(conversation)
        marked_blocks = result[0]["content"]
        assert marked_blocks[0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}
        for later_block in (marked_blocks[1], marked_blocks[2]):
            assert "cache_control" not in later_block

    def test_rolling_marker_on_last_2_messages(self):
        """Only the last two non-system messages are expected to be marked."""
        conversation = [
            {"role": "system", "content": [{"type": "text", "text": "S"}]},
            {"role": "user", "content": "u1"},
            {"role": "assistant", "content": "a1"},
            {"role": "user", "content": "u2"},
            {"role": "assistant", "content": "a2"},
        ]
        result = apply_anthropic_cache_control_long_lived(conversation)

        def carries_marker(message):
            # Look at the last content part when content is a non-empty list
            # ending in a dict; otherwise look at the message envelope itself.
            content = message.get("content")
            if not (isinstance(content, list) and content):
                return "cache_control" in message
            tail = content[-1]
            if not isinstance(tail, dict):
                return "cache_control" in message
            return "cache_control" in tail

        # Older turns (u1, a1) stay unmarked.
        for older in (result[1], result[2]):
            assert not carries_marker(older)
        # The two most recent turns (u2, a2) carry the marker.
        for recent in (result[3], result[4]):
            assert carries_marker(recent)

    def test_rolling_marker_uses_5m_ttl(self):
        """The rolling marker with ``rolling_ttl="5m"`` has no ``ttl`` key."""
        conversation = [
            {"role": "system", "content": [{"type": "text", "text": "S"}]},
            {"role": "user", "content": "u1"},
            {"role": "assistant", "content": "a1"},
        ]
        result = apply_anthropic_cache_control_long_lived(
            conversation,
            long_lived_ttl="1h",
            rolling_ttl="5m",
        )
        # The final message's content is expected to be wrapped into a list
        # whose last part carries the 5m marker (expressed without a ttl key).
        final_content = result[-1]["content"]
        assert isinstance(final_content, list)
        assert final_content[-1]["cache_control"] == {"type": "ephemeral"}

    def test_string_system_falls_back_to_envelope_marker(self):
        """When the caller didn't split the system message, we still place a marker."""
        conversation = [
            {"role": "system", "content": "Single string system"},
            {"role": "user", "content": "u1"},
        ]
        result = apply_anthropic_cache_control_long_lived(conversation)
        wrapped = result[0]["content"]
        # The string is expected to be wrapped into a list whose first block
        # receives the 1h marker.
        assert isinstance(wrapped, list)
        assert wrapped[0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}

    def test_does_not_mutate_input(self):
        """The caller's message list must compare equal before and after."""
        conversation = [
            {"role": "system", "content": [{"type": "text", "text": "S"}]},
            {"role": "user", "content": "u1"},
        ]
        snapshot = copy.deepcopy(conversation)
        apply_anthropic_cache_control_long_lived(conversation)
        assert conversation == snapshot

    def test_max_4_breakpoints_with_split_system(self):
        """A split system plus ten turns yields exactly three markers here."""
        roles = ("user", "assistant")
        system_message = {
            "role": "system",
            "content": [{"type": "text", "text": "S"}, {"type": "text", "text": "V"}],
        }
        turns = [
            {"role": roles[i % 2], "content": f"msg{i}"}
            for i in range(10)
        ]
        result = apply_anthropic_cache_control_long_lived([system_message] + turns)

        def breakpoints_in(message):
            # List content: count marked dict parts. Any other content:
            # count a marker on the message envelope.
            content = message.get("content")
            if isinstance(content, list):
                return sum(
                    1
                    for part in content
                    if isinstance(part, dict) and "cache_control" in part
                )
            return 1 if "cache_control" in message else 0

        total = sum(breakpoints_in(message) for message in result)
        # 1 system block + last 2 messages = 3 breakpoints from this function.
        # tools[-1] is marked separately (not via this function), so a 4th
        # breakpoint can be added at API-call time.
        assert total == 3
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue