mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-24 05:41:40 +00:00
fix(cache): kill long-lived prefix layout — system prompt is now byte-static within a session (#24778)
The long-lived prefix-cache layout split the system prompt into stable/ context/volatile blocks and re-derived them on every API call. The volatile tier (timestamp + memory snapshot + USER profile) ticks per turn, so the system message bytes mutated mid-conversation and broke upstream prompt caches (OpenRouter, Nous Portal, Anthropic). Diagnosed via live wire-format diffing: an 8-turn conversation showed OLD layout flipping system block[1] sha mid-session at the minute boundary, dropping cached_tokens to 0 on that turn (cumulative 66.6% vs 83.3% for the single-block layout). Hermes invariant: history (system + all but the last 1-2 messages) must be static. Fix: drop the long-lived layout entirely. Single layout everywhere — system_and_3 with one cached system string built once on first turn, replayed verbatim on every subsequent turn. Loses cross-session 1h prefix caching for Claude (the feature that motivated the split), but within-session caching now actually works on every provider. Removed: - run_agent.py: _use_long_lived_prefix_cache flag, _long_lived_cache_ttl, _supports_long_lived_anthropic_cache method, the long-lived branch in run_conversation, mark_tools_for_long_lived_cache call site - agent/prompt_caching.py: apply_anthropic_cache_control_long_lived, mark_tools_for_long_lived_cache, _mark_system_stable_block helper - hermes_cli/config.py: prompt_caching.long_lived_prefix and prompt_caching.long_lived_ttl config keys - tests/agent/test_prompt_caching_live.py (entire file) - tests/agent/test_prompt_caching.py: TestMarkToolsForLongLivedCache, TestApplyAnthropicCacheControlLongLived - tests/run_agent/test_anthropic_prompt_cache_policy.py: TestSupportsLongLivedAnthropicCache Targeted tests: 62/62 pass.
This commit is contained in:
parent
80374d4dd9
commit
b06e999302
8 changed files with 41 additions and 714 deletions
234
run_agent.py
234
run_agent.py
|
|
@ -1454,15 +1454,6 @@ class AIAgent:
|
|||
# 1h tier costs 2x on write vs 1.25x for 5m, but amortizes across long
|
||||
# sessions with >5-minute pauses between turns (#14971).
|
||||
self._cache_ttl = "5m"
|
||||
# Long-lived prefix caching: when enabled and supported by the
|
||||
# current provider, splits the system prompt into a stable prefix
|
||||
# (cached cross-session at 1h TTL) and a volatile suffix
|
||||
# (memory/timestamp — never cached), and attaches a 1h cache_control
|
||||
# marker to the last tool in the schema array. Restricted to
|
||||
# Claude on Anthropic / OpenRouter / Nous Portal; see
|
||||
# ``_supports_long_lived_anthropic_cache``.
|
||||
self._use_long_lived_prefix_cache = False
|
||||
self._long_lived_cache_ttl = "1h"
|
||||
try:
|
||||
from hermes_cli.config import load_config as _load_pc_cfg
|
||||
|
||||
|
|
@ -1470,12 +1461,6 @@ class AIAgent:
|
|||
_ttl = _pc_cfg.get("cache_ttl", "5m")
|
||||
if _ttl in {"5m", "1h"}:
|
||||
self._cache_ttl = _ttl
|
||||
_ll_enabled = _pc_cfg.get("long_lived_prefix", True)
|
||||
_ll_ttl = _pc_cfg.get("long_lived_ttl", "1h")
|
||||
if _ll_ttl in ("5m", "1h"):
|
||||
self._long_lived_cache_ttl = _ll_ttl
|
||||
if _ll_enabled and self._use_prompt_caching and self._supports_long_lived_anthropic_cache():
|
||||
self._use_long_lived_prefix_cache = True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
@ -2480,7 +2465,6 @@ class AIAgent:
|
|||
"client_kwargs": dict(self._client_kwargs),
|
||||
"use_prompt_caching": self._use_prompt_caching,
|
||||
"use_native_cache_layout": self._use_native_cache_layout,
|
||||
"use_long_lived_prefix_cache": self._use_long_lived_prefix_cache,
|
||||
# Context engine state that _try_activate_fallback() overwrites.
|
||||
# Use getattr for model/base_url/api_key/provider since plugin
|
||||
# engines may not have these (they're ContextCompressor-specific).
|
||||
|
|
@ -2716,15 +2700,6 @@ class AIAgent:
|
|||
model=new_model,
|
||||
)
|
||||
)
|
||||
self._use_long_lived_prefix_cache = bool(
|
||||
self._use_prompt_caching
|
||||
and self._supports_long_lived_anthropic_cache(
|
||||
provider=new_provider,
|
||||
base_url=self.base_url,
|
||||
api_mode=api_mode,
|
||||
model=new_model,
|
||||
)
|
||||
)
|
||||
|
||||
# ── LM Studio: preload before probing context length ──
|
||||
self._ensure_lmstudio_runtime_loaded()
|
||||
|
|
@ -2773,7 +2748,6 @@ class AIAgent:
|
|||
"client_kwargs": dict(self._client_kwargs),
|
||||
"use_prompt_caching": self._use_prompt_caching,
|
||||
"use_native_cache_layout": self._use_native_cache_layout,
|
||||
"use_long_lived_prefix_cache": self._use_long_lived_prefix_cache,
|
||||
"compressor_model": getattr(_cc, "model", self.model) if _cc else self.model,
|
||||
"compressor_base_url": getattr(_cc, "base_url", self.base_url) if _cc else self.base_url,
|
||||
"compressor_api_key": getattr(_cc, "api_key", "") if _cc else "",
|
||||
|
|
@ -3584,80 +3558,6 @@ class AIAgent:
|
|||
|
||||
return False, False
|
||||
|
||||
def _supports_long_lived_anthropic_cache(
|
||||
self,
|
||||
*,
|
||||
provider: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
api_mode: Optional[str] = None,
|
||||
model: Optional[str] = None,
|
||||
) -> bool:
|
||||
"""Decide whether the long-lived (1h cross-session) cache layout applies.
|
||||
|
||||
Narrower than ``_anthropic_prompt_cache_policy`` — only enabled
|
||||
for Claude models on the four endpoints whose cross-session
|
||||
cache_control behavior we have explicitly validated:
|
||||
|
||||
* Native Anthropic API (``api_mode == 'anthropic_messages'`` +
|
||||
host ``api.anthropic.com``)
|
||||
* Anthropic OAuth subscription (same transport as native API)
|
||||
* OpenRouter (``base_url`` contains ``openrouter.ai``)
|
||||
* Nous Portal (``base_url`` contains ``nousresearch`` — proxies
|
||||
to OpenRouter, so identical wire-format)
|
||||
|
||||
All four honour ``cache_control`` on both the tools array and the
|
||||
first system content block, and bill cross-session cache reads at
|
||||
the documented 0.1× rate.
|
||||
|
||||
Other endpoints covered by the standard ``system_and_3`` policy
|
||||
(third-party Anthropic gateways, MiniMax, opencode-go Qwen, etc.)
|
||||
keep that layout — they support cache_control but their behavior
|
||||
with mixed-TTL multi-block system content has not been validated
|
||||
against this codebase.
|
||||
"""
|
||||
eff_provider = (provider if provider is not None else self.provider) or ""
|
||||
eff_base_url = base_url if base_url is not None else (self.base_url or "")
|
||||
eff_api_mode = api_mode if api_mode is not None else (self.api_mode or "")
|
||||
eff_model = (model if model is not None else self.model) or ""
|
||||
|
||||
model_lower = eff_model.lower()
|
||||
is_claude = "claude" in model_lower
|
||||
is_nous_portal = "nousresearch" in eff_base_url.lower()
|
||||
|
||||
# Nous Portal Claude rides the 1h prefix_and_2 layout (Portal
|
||||
# proxies to OpenRouter, which honours ttl=1h on Anthropic
|
||||
# routes). Qwen does NOT — Alibaba DashScope (the upstream for
|
||||
# all Qwen routes, including Portal -> OpenRouter -> Alibaba)
|
||||
# documents a single ``ephemeral`` TTL of 5 minutes; ttl="1h"
|
||||
# on Qwen markers is silently ignored upstream, so the
|
||||
# high-value tools[-1] + system-prefix breakpoints never land
|
||||
# and only the 5m rolling-window markers on the last 2 messages
|
||||
# get cached. Portal Qwen still gets cache_control via
|
||||
# _anthropic_prompt_cache_policy returning (True, False) — it
|
||||
# just rides the standard system_and_3 5m layout instead of the
|
||||
# mismatched prefix_and_2 1h layout.
|
||||
if is_nous_portal and is_claude:
|
||||
return True
|
||||
|
||||
if not is_claude:
|
||||
return False
|
||||
|
||||
# Native Anthropic + Anthropic OAuth subscription
|
||||
if eff_api_mode == "anthropic_messages":
|
||||
if eff_provider == "anthropic" or base_url_hostname(eff_base_url) == "api.anthropic.com":
|
||||
return True
|
||||
|
||||
# OpenRouter
|
||||
if base_url_host_matches(eff_base_url, "openrouter.ai"):
|
||||
return True
|
||||
|
||||
# Nous Portal — front-ends OpenRouter behind the scenes; identical
|
||||
# wire format and cache_control semantics.
|
||||
if is_nous_portal:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _model_requires_responses_api(model: str) -> bool:
|
||||
"""Return True for models that require the Responses API path.
|
||||
|
|
@ -5906,26 +5806,19 @@ class AIAgent:
|
|||
"""Assemble the system prompt as three ordered parts.
|
||||
|
||||
Returns a dict with three keys:
|
||||
* ``stable`` — content that is byte-stable across sessions for a
|
||||
given user config: identity, tool guidance, skills prompt,
|
||||
* ``stable`` — identity, tool guidance, skills prompt,
|
||||
environment hints, platform hints, model-family operational
|
||||
guidance. Eligible for cross-session 1h prompt caching when
|
||||
placed as a separate Anthropic content block (see
|
||||
``apply_anthropic_cache_control_long_lived``).
|
||||
* ``context`` — context files (AGENTS.md, .cursorrules, etc.) and
|
||||
caller-supplied system_message. Stable within a session but may
|
||||
change between sessions when files are edited or the cwd
|
||||
differs. Cached within-session via the rolling messages
|
||||
breakpoint (5m TTL); not promoted to the long-lived tier so
|
||||
edits don't poison the cross-session cache.
|
||||
* ``volatile`` — content that changes on most turns/sessions:
|
||||
memory snapshot, user profile, external memory provider block,
|
||||
timestamp line. Never marked for caching.
|
||||
guidance.
|
||||
* ``context`` — context files (AGENTS.md, .cursorrules, etc.)
|
||||
and caller-supplied system_message.
|
||||
* ``volatile`` — memory snapshot, user profile, external
|
||||
memory provider block, timestamp line.
|
||||
|
||||
Joined ``stable\\n\\ncontext\\n\\nvolatile`` produces the same
|
||||
logical content the old single-string builder produced, with the
|
||||
guarantee that volatile content is at the end (cache-friendly
|
||||
ordering for any provider that does prefix caching).
|
||||
Joined into a single string by ``_build_system_prompt`` and
|
||||
cached on ``_cached_system_prompt`` for the lifetime of the
|
||||
AIAgent. Hermes never re-renders parts of this string mid-
|
||||
session — that's the only way to keep upstream prompt caches
|
||||
warm across turns.
|
||||
"""
|
||||
# ── Stable tier ────────────────────────────────────────────────
|
||||
stable_parts: List[str] = []
|
||||
|
|
@ -6127,9 +6020,10 @@ class AIAgent:
|
|||
|
||||
Layers are ordered cache-friendly: stable identity/guidance first,
|
||||
then session-stable context files, then per-call volatile content
|
||||
(memory, USER profile, timestamp). The split is exposed via
|
||||
``_build_system_prompt_parts`` for the long-lived prompt-caching
|
||||
path (Claude on Anthropic / OpenRouter / Nous Portal).
|
||||
(memory, USER profile, timestamp). The whole string is treated as
|
||||
one cached block — Hermes never rebuilds or reinjects parts of it
|
||||
mid-session, which is the only way to keep upstream prompt caches
|
||||
warm across turns.
|
||||
"""
|
||||
parts = self._build_system_prompt_parts(system_message=system_message)
|
||||
joined = "\n\n".join(p for p in (parts["stable"], parts["context"], parts["volatile"]) if p)
|
||||
|
|
@ -8896,15 +8790,6 @@ class AIAgent:
|
|||
model=fb_model,
|
||||
)
|
||||
)
|
||||
self._use_long_lived_prefix_cache = bool(
|
||||
self._use_prompt_caching
|
||||
and self._supports_long_lived_anthropic_cache(
|
||||
provider=fb_provider,
|
||||
base_url=fb_base_url,
|
||||
api_mode=fb_api_mode,
|
||||
model=fb_model,
|
||||
)
|
||||
)
|
||||
|
||||
# LM Studio: preload before probing the fallback's context length.
|
||||
self._ensure_lmstudio_runtime_loaded()
|
||||
|
|
@ -8981,16 +8866,6 @@ class AIAgent:
|
|||
"use_native_cache_layout",
|
||||
self.api_mode == "anthropic_messages" and self.provider == "anthropic",
|
||||
)
|
||||
# Long-lived prefix flag was added later — restore False on
|
||||
# snapshots predating the new field, then re-evaluate against
|
||||
# the restored provider/model in case the user had it enabled.
|
||||
self._use_long_lived_prefix_cache = rt.get(
|
||||
"use_long_lived_prefix_cache",
|
||||
bool(
|
||||
self._use_prompt_caching
|
||||
and self._supports_long_lived_anthropic_cache()
|
||||
),
|
||||
)
|
||||
|
||||
# ── Rebuild client for the primary provider ──
|
||||
if self.api_mode == "anthropic_messages":
|
||||
|
|
@ -9568,19 +9443,7 @@ class AIAgent:
|
|||
|
||||
def _build_api_kwargs(self, api_messages: list) -> dict:
|
||||
"""Build the keyword arguments dict for the active API mode."""
|
||||
# Resolve the tools array exactly once. When the long-lived
|
||||
# prefix-cache layout is active (Claude on Anthropic / OpenRouter
|
||||
# / Nous Portal), attach a 1h cache_control marker to the last
|
||||
# tool — this caches the entire tools array cross-session via
|
||||
# Anthropic's tools→system→messages prefix order. The function
|
||||
# returns a deep copy, so self.tools is never mutated.
|
||||
if self._use_long_lived_prefix_cache and self.tools:
|
||||
from agent.prompt_caching import mark_tools_for_long_lived_cache
|
||||
tools_for_api = mark_tools_for_long_lived_cache(
|
||||
self.tools, long_lived_ttl=self._long_lived_cache_ttl,
|
||||
)
|
||||
else:
|
||||
tools_for_api = self.tools
|
||||
tools_for_api = self.tools
|
||||
|
||||
if self.api_mode == "anthropic_messages":
|
||||
_transport = self._get_transport()
|
||||
|
|
@ -12440,36 +12303,21 @@ class AIAgent:
|
|||
# External recall context is injected into the user message, not the system
|
||||
# prompt, so the stable cache prefix remains unchanged.
|
||||
#
|
||||
# When the long-lived prefix-cache layout is active (Claude on
|
||||
# Anthropic / OpenRouter / Nous Portal), we build the system
|
||||
# message as a *list of content blocks*: [stable, context,
|
||||
# volatile, ephemeral?]. Block 0 (stable) gets the 1h
|
||||
# cache_control marker further down via
|
||||
# apply_anthropic_cache_control_long_lived; blocks 1-3 are
|
||||
# cached only via the rolling messages window at 5m.
|
||||
# NOTE: Plugin context from pre_llm_call hooks is injected into the
|
||||
# user message (see injection block above), NOT the system prompt.
|
||||
# This is intentional — system prompt modifications break the prompt
|
||||
# cache prefix. The system prompt is reserved for Hermes internals.
|
||||
if self._use_long_lived_prefix_cache:
|
||||
_sys_parts = self._build_system_prompt_parts(system_message=system_message)
|
||||
_sys_blocks: list = []
|
||||
if _sys_parts.get("stable"):
|
||||
_sys_blocks.append({"type": "text", "text": _sys_parts["stable"]})
|
||||
if _sys_parts.get("context"):
|
||||
_sys_blocks.append({"type": "text", "text": _sys_parts["context"]})
|
||||
if _sys_parts.get("volatile"):
|
||||
_sys_blocks.append({"type": "text", "text": _sys_parts["volatile"]})
|
||||
if self.ephemeral_system_prompt:
|
||||
_sys_blocks.append({"type": "text", "text": self.ephemeral_system_prompt})
|
||||
if _sys_blocks:
|
||||
api_messages = [{"role": "system", "content": _sys_blocks}] + api_messages
|
||||
else:
|
||||
effective_system = active_system_prompt or ""
|
||||
if self.ephemeral_system_prompt:
|
||||
effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
|
||||
if effective_system:
|
||||
api_messages = [{"role": "system", "content": effective_system}] + api_messages
|
||||
#
|
||||
# Hermes invariant: the system prompt is built ONCE per session
|
||||
# (cached on ``_cached_system_prompt``) and replayed verbatim on
|
||||
# every turn. We send it as a single content string so the
|
||||
# bytes are byte-stable across turns and upstream prompt caches
|
||||
# stay warm.
|
||||
effective_system = active_system_prompt or ""
|
||||
if self.ephemeral_system_prompt:
|
||||
effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
|
||||
if effective_system:
|
||||
api_messages = [{"role": "system", "content": effective_system}] + api_messages
|
||||
|
||||
# Inject ephemeral prefill messages right after the system prompt
|
||||
# but before conversation history. Same API-call-time-only pattern.
|
||||
|
|
@ -12483,29 +12331,13 @@ class AIAgent:
|
|||
# gateways. Auto-detected: if ``_use_prompt_caching`` is set,
|
||||
# inject cache_control breakpoints (system + last 3 messages)
|
||||
# to reduce input token costs by ~75% on multi-turn
|
||||
# conversations. Layout is chosen per endpoint by
|
||||
# ``_anthropic_prompt_cache_policy``.
|
||||
#
|
||||
# Long-lived prefix layout (prefix_and_2): stable system block
|
||||
# gets 1h marker + last 2 messages get 5m markers. Tools
|
||||
# array's last entry is marked separately at API-call kwargs
|
||||
# build time (see ``_build_api_kwargs`` and
|
||||
# ``mark_tools_for_long_lived_cache``).
|
||||
# conversations.
|
||||
if self._use_prompt_caching:
|
||||
if self._use_long_lived_prefix_cache:
|
||||
from agent.prompt_caching import apply_anthropic_cache_control_long_lived
|
||||
api_messages = apply_anthropic_cache_control_long_lived(
|
||||
api_messages,
|
||||
long_lived_ttl=self._long_lived_cache_ttl,
|
||||
rolling_ttl=self._cache_ttl,
|
||||
native_anthropic=self._use_native_cache_layout,
|
||||
)
|
||||
else:
|
||||
api_messages = apply_anthropic_cache_control(
|
||||
api_messages,
|
||||
cache_ttl=self._cache_ttl,
|
||||
native_anthropic=self._use_native_cache_layout,
|
||||
)
|
||||
api_messages = apply_anthropic_cache_control(
|
||||
api_messages,
|
||||
cache_ttl=self._cache_ttl,
|
||||
native_anthropic=self._use_native_cache_layout,
|
||||
)
|
||||
|
||||
# Safety net: strip orphaned tool results / add stubs for missing
|
||||
# results before sending to the API. Runs unconditionally — not
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue