mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
The long-lived prefix-cache layout split the system prompt into stable/context/volatile blocks and re-derived them on every API call. The volatile tier (timestamp + memory snapshot + USER profile) ticks per turn, so the system message bytes mutated mid-conversation and broke upstream prompt caches (OpenRouter, Nous Portal, Anthropic). Diagnosed via live wire-format diffing: an 8-turn conversation showed the OLD layout flipping system block[1] sha mid-session at the minute boundary, dropping cached_tokens to 0 on that turn (cumulative 66.6% vs 83.3% for the single-block layout). Hermes invariant: history (system + all but the last 1-2 messages) must be static. Fix: drop the long-lived layout entirely. Single layout everywhere — system_and_3 with one cached system string built once on the first turn, replayed verbatim on every subsequent turn. Loses cross-session 1h prefix caching for Claude (the feature that motivated the split), but within-session caching now actually works on every provider. Removed: - run_agent.py: _use_long_lived_prefix_cache flag, _long_lived_cache_ttl, _supports_long_lived_anthropic_cache method, the long-lived branch in run_conversation, mark_tools_for_long_lived_cache call site - agent/prompt_caching.py: apply_anthropic_cache_control_long_lived, mark_tools_for_long_lived_cache, _mark_system_stable_block helper - hermes_cli/config.py: prompt_caching.long_lived_prefix and prompt_caching.long_lived_ttl config keys - tests/agent/test_prompt_caching_live.py (entire file) - tests/agent/test_prompt_caching.py: TestMarkToolsForLongLivedCache, TestApplyAnthropicCacheControlLongLived - tests/run_agent/test_anthropic_prompt_cache_policy.py: TestSupportsLongLivedAnthropicCache Targeted tests: 62/62 pass.
79 lines
2.4 KiB
Python
79 lines
2.4 KiB
Python
"""Anthropic prompt caching strategy.
|
|
|
|
Single layout: ``system_and_3``. 4 cache_control breakpoints — system
|
|
prompt + last 3 non-system messages, all at the same TTL (5m or 1h).
|
|
Reduces input token costs by ~75% on multi-turn conversations within a
|
|
single session.
|
|
|
|
Pure functions -- no class state, no AIAgent dependency.
|
|
"""
|
|
|
|
import copy
|
|
from typing import Any, Dict, List
|
|
|
|
|
|
def _apply_cache_marker(msg: dict, cache_marker: dict, native_anthropic: bool = False) -> None:
|
|
"""Add cache_control to a single message, handling all format variations."""
|
|
role = msg.get("role", "")
|
|
content = msg.get("content")
|
|
|
|
if role == "tool":
|
|
if native_anthropic:
|
|
msg["cache_control"] = cache_marker
|
|
return
|
|
|
|
if content is None or content == "":
|
|
msg["cache_control"] = cache_marker
|
|
return
|
|
|
|
if isinstance(content, str):
|
|
msg["content"] = [
|
|
{"type": "text", "text": content, "cache_control": cache_marker}
|
|
]
|
|
return
|
|
|
|
if isinstance(content, list) and content:
|
|
last = content[-1]
|
|
if isinstance(last, dict):
|
|
last["cache_control"] = cache_marker
|
|
|
|
|
|
def _build_marker(ttl: str) -> Dict[str, str]:
|
|
"""Build a cache_control marker dict for the given TTL ('5m' or '1h')."""
|
|
marker: Dict[str, str] = {"type": "ephemeral"}
|
|
if ttl == "1h":
|
|
marker["ttl"] = "1h"
|
|
return marker
|
|
|
|
|
|
def apply_anthropic_cache_control(
    api_messages: List[Dict[str, Any]],
    cache_ttl: str = "5m",
    native_anthropic: bool = False,
) -> List[Dict[str, Any]]:
    """Inject ``system_and_3`` cache breakpoints into a copy of the messages.

    A leading system message (index 0) takes one breakpoint; the remaining
    budget, out of 4 in total, goes to the trailing non-system messages. So
    with a leading system message the last 3 non-system messages are
    selected, and without one the last 4 are. System messages at any other
    position are never selected. All breakpoints share one marker built from
    ``cache_ttl``.

    Args:
        api_messages: Messages to annotate. Never mutated.
        cache_ttl: TTL passed to ``_build_marker``.
        native_anthropic: Forwarded to ``_apply_cache_marker`` for every
            selected message.

    Returns:
        Deep copy of messages with cache_control breakpoints injected. An
        empty input yields an empty copy.
    """
    annotated = copy.deepcopy(api_messages)
    if not annotated:
        return annotated

    marker = _build_marker(cache_ttl)

    # One of the four breakpoints is spent on the system prompt when it
    # opens the conversation.
    leads_with_system = annotated[0].get("role") == "system"
    if leads_with_system:
        _apply_cache_marker(annotated[0], marker, native_anthropic=native_anthropic)
    tail_budget = 3 if leads_with_system else 4

    conversation = [m for m in annotated if m.get("role") != "system"]
    for message in conversation[-tail_budget:]:
        _apply_cache_marker(message, marker, native_anthropic=native_anthropic)

    return annotated
|