mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-24 05:41:40 +00:00
fix(cache): kill long-lived prefix layout — system prompt is now byte-static within a session (#24778)
The long-lived prefix-cache layout split the system prompt into stable/ context/volatile blocks and re-derived them on every API call. The volatile tier (timestamp + memory snapshot + USER profile) ticks per turn, so the system message bytes mutated mid-conversation and broke upstream prompt caches (OpenRouter, Nous Portal, Anthropic). Diagnosed via live wire-format diffing: an 8-turn conversation showed OLD layout flipping system block[1] sha mid-session at the minute boundary, dropping cached_tokens to 0 on that turn (cumulative 66.6% vs 83.3% for the single-block layout). Hermes invariant: history (system + all but the last 1-2 messages) must be static. Fix: drop the long-lived layout entirely. Single layout everywhere — system_and_3 with one cached system string built once on first turn, replayed verbatim on every subsequent turn. Loses cross-session 1h prefix caching for Claude (the feature that motivated the split), but within-session caching now actually works on every provider. Removed: - run_agent.py: _use_long_lived_prefix_cache flag, _long_lived_cache_ttl, _supports_long_lived_anthropic_cache method, the long-lived branch in run_conversation, mark_tools_for_long_lived_cache call site - agent/prompt_caching.py: apply_anthropic_cache_control_long_lived, mark_tools_for_long_lived_cache, _mark_system_stable_block helper - hermes_cli/config.py: prompt_caching.long_lived_prefix and prompt_caching.long_lived_ttl config keys - tests/agent/test_prompt_caching_live.py (entire file) - tests/agent/test_prompt_caching.py: TestMarkToolsForLongLivedCache, TestApplyAnthropicCacheControlLongLived - tests/run_agent/test_anthropic_prompt_cache_policy.py: TestSupportsLongLivedAnthropicCache Targeted tests: 62/62 pass.
This commit is contained in:
parent
80374d4dd9
commit
b06e999302
8 changed files with 41 additions and 714 deletions
|
|
@ -1305,9 +1305,8 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
|
|||
),
|
||||
}
|
||||
# Forward cache_control marker when present on the OpenAI-format
|
||||
# tool dict (set by ``mark_tools_for_long_lived_cache``). Anthropic's
|
||||
# tools array supports cache_control on the last tool to cache the
|
||||
# entire schema cross-session.
|
||||
# tool dict. Anthropic's tools array supports cache_control on the
|
||||
# last tool to cache the entire schema cross-session.
|
||||
cache_control = t.get("cache_control")
|
||||
if isinstance(cache_control, dict):
|
||||
anthropic_tool["cache_control"] = dict(cache_control)
|
||||
|
|
|
|||
|
|
@ -1,25 +1,15 @@
|
|||
"""Anthropic prompt caching strategies.
|
||||
"""Anthropic prompt caching strategy.
|
||||
|
||||
Two layouts:
|
||||
|
||||
* ``system_and_3`` (default, used everywhere except the long-lived path):
|
||||
4 cache_control breakpoints — system prompt + last 3 non-system messages.
|
||||
All at the same TTL (5m or 1h). Reduces input token costs by ~75% on
|
||||
multi-turn conversations within a single session.
|
||||
|
||||
* ``prefix_and_2`` (Claude on Anthropic / OpenRouter / Nous Portal):
|
||||
4 breakpoints split across two TTL tiers — tools[-1] (1h) +
|
||||
stable system prefix (1h) + last 2 non-system messages (5m). The
|
||||
long-lived prefix is byte-stable across sessions for a given user
|
||||
config, so every fresh session reads the cached system+tools instead
|
||||
of re-paying for them. Within-session rolling window shrinks from 3
|
||||
messages to 2 to free the breakpoint budget.
|
||||
Single layout: ``system_and_3``. 4 cache_control breakpoints — system
|
||||
prompt + last 3 non-system messages, all at the same TTL (5m or 1h).
|
||||
Reduces input token costs by ~75% on multi-turn conversations within a
|
||||
single session.
|
||||
|
||||
Pure functions -- no class state, no AIAgent dependency.
|
||||
"""
|
||||
|
||||
import copy
|
||||
from typing import Any, Dict, List, Optional
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
def _apply_cache_marker(msg: dict, cache_marker: dict, native_anthropic: bool = False) -> None:
|
||||
|
|
@ -87,115 +77,3 @@ def apply_anthropic_cache_control(
|
|||
_apply_cache_marker(messages[idx], marker, native_anthropic=native_anthropic)
|
||||
|
||||
return messages
|
||||
|
||||
|
||||
def _mark_system_stable_block(
|
||||
messages: List[Dict[str, Any]],
|
||||
long_lived_marker: Dict[str, str],
|
||||
) -> bool:
|
||||
"""Mark the *first* content block of the system message with the 1h marker.
|
||||
|
||||
The system message is expected to have been split into multiple content
|
||||
blocks beforehand by the caller — block[0] is the cross-session-stable
|
||||
prefix, subsequent blocks carry context files + volatile suffix.
|
||||
Falls back to marking the whole system message as a single block when
|
||||
the message hasn't been split (preserves correctness on the fallback path).
|
||||
|
||||
Returns True when a marker was placed.
|
||||
"""
|
||||
if not messages or messages[0].get("role") != "system":
|
||||
return False
|
||||
|
||||
sys_msg = messages[0]
|
||||
content = sys_msg.get("content")
|
||||
|
||||
# Already a list of blocks → mark the first block.
|
||||
if isinstance(content, list) and content:
|
||||
first = content[0]
|
||||
if isinstance(first, dict):
|
||||
first["cache_control"] = long_lived_marker
|
||||
return True
|
||||
return False
|
||||
|
||||
# String content (no split) → cannot place a stable-prefix breakpoint
|
||||
# without changing the byte content. Caller is responsible for
|
||||
# splitting; if they didn't, fall through to envelope marker so we still
|
||||
# cache *something* for this turn.
|
||||
if isinstance(content, str) and content:
|
||||
sys_msg["content"] = [
|
||||
{"type": "text", "text": content, "cache_control": long_lived_marker}
|
||||
]
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def apply_anthropic_cache_control_long_lived(
|
||||
api_messages: List[Dict[str, Any]],
|
||||
long_lived_ttl: str = "1h",
|
||||
rolling_ttl: str = "5m",
|
||||
native_anthropic: bool = False,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Apply prefix_and_2 caching: long-lived stable prefix + rolling window.
|
||||
|
||||
Layout (4 breakpoints total):
|
||||
* Stable system prefix (block[0]) → ``long_lived_ttl`` TTL
|
||||
* Last 2 non-system messages → ``rolling_ttl`` TTL each
|
||||
|
||||
NOTE: this function does NOT mark the tools array. Tools cache_control
|
||||
is attached separately (see ``mark_tools_for_long_lived_cache``) because
|
||||
tools live outside the messages list in the API payload.
|
||||
|
||||
The caller MUST have split the system message into ordered content
|
||||
blocks where block[0] is the cross-session-stable portion. If the system
|
||||
message is still a single string, it is wrapped into a single block and
|
||||
marked — this is correct, just less effective (the volatile suffix is
|
||||
not isolated, so the prefix invalidates per-session).
|
||||
|
||||
Returns:
|
||||
Deep copy of messages with cache_control breakpoints injected.
|
||||
"""
|
||||
messages = copy.deepcopy(api_messages)
|
||||
if not messages:
|
||||
return messages
|
||||
|
||||
long_marker = _build_marker(long_lived_ttl)
|
||||
rolling_marker = _build_marker(rolling_ttl)
|
||||
|
||||
placed_prefix = _mark_system_stable_block(messages, long_marker)
|
||||
|
||||
# Reserve 1 breakpoint for the system prefix (when placed); spend the
|
||||
# remaining 3 on the rolling tail. Anthropic max is 4 total —
|
||||
# tools[-1] (when marked) consumes the 4th, so we cap rolling at 2 here.
|
||||
rolling_budget = 2 if placed_prefix else 3
|
||||
non_sys = [i for i in range(len(messages)) if messages[i].get("role") != "system"]
|
||||
for idx in non_sys[-rolling_budget:]:
|
||||
_apply_cache_marker(messages[idx], rolling_marker, native_anthropic=native_anthropic)
|
||||
|
||||
return messages
|
||||
|
||||
|
||||
def mark_tools_for_long_lived_cache(
|
||||
tools: Optional[List[Dict[str, Any]]],
|
||||
long_lived_ttl: str = "1h",
|
||||
) -> Optional[List[Dict[str, Any]]]:
|
||||
"""Attach cache_control to the last tool in the OpenAI-format tools list.
|
||||
|
||||
Anthropic prefix-cache order is ``tools → system → messages``. Marking
|
||||
the last tool dict caches the entire tools array (Anthropic's docs:
|
||||
"the marker is placed on the last block you want included in the cached
|
||||
prefix"). Marker is preserved across the OpenAI-wire boundary on
|
||||
OpenRouter and Nous Portal (which proxies to OpenRouter); on native
|
||||
Anthropic the marker is forwarded by ``convert_tools_to_anthropic``.
|
||||
|
||||
Returns a deep copy of the tools list with the marker attached, or the
|
||||
input unchanged when tools is empty/None. Pure function — does not
|
||||
mutate the input.
|
||||
"""
|
||||
if not tools:
|
||||
return tools
|
||||
out = copy.deepcopy(tools)
|
||||
last = out[-1]
|
||||
if isinstance(last, dict):
|
||||
last["cache_control"] = _build_marker(long_lived_ttl)
|
||||
return out
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue