diff --git a/agent/anthropic_adapter.py b/agent/anthropic_adapter.py index 3919c8565b2..4b1134a4c0b 100644 --- a/agent/anthropic_adapter.py +++ b/agent/anthropic_adapter.py @@ -1305,9 +1305,8 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]: ), } # Forward cache_control marker when present on the OpenAI-format - # tool dict (set by ``mark_tools_for_long_lived_cache``). Anthropic's - # tools array supports cache_control on the last tool to cache the - # entire schema cross-session. + # tool dict. Anthropic's tools array supports cache_control on the + # last tool to cache the entire schema cross-session. cache_control = t.get("cache_control") if isinstance(cache_control, dict): anthropic_tool["cache_control"] = dict(cache_control) diff --git a/agent/prompt_caching.py b/agent/prompt_caching.py index 4829c96b332..a73d6e113d9 100644 --- a/agent/prompt_caching.py +++ b/agent/prompt_caching.py @@ -1,25 +1,15 @@ -"""Anthropic prompt caching strategies. +"""Anthropic prompt caching strategy. -Two layouts: - -* ``system_and_3`` (default, used everywhere except the long-lived path): - 4 cache_control breakpoints — system prompt + last 3 non-system messages. - All at the same TTL (5m or 1h). Reduces input token costs by ~75% on - multi-turn conversations within a single session. - -* ``prefix_and_2`` (Claude on Anthropic / OpenRouter / Nous Portal): - 4 breakpoints split across two TTL tiers — tools[-1] (1h) + - stable system prefix (1h) + last 2 non-system messages (5m). The - long-lived prefix is byte-stable across sessions for a given user - config, so every fresh session reads the cached system+tools instead - of re-paying for them. Within-session rolling window shrinks from 3 - messages to 2 to free the breakpoint budget. +Single layout: ``system_and_3``. 4 cache_control breakpoints — system +prompt + last 3 non-system messages, all at the same TTL (5m or 1h). +Reduces input token costs by ~75% on multi-turn conversations within a +single session. Pure functions -- no class state, no AIAgent dependency. """ import copy -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List def _apply_cache_marker(msg: dict, cache_marker: dict, native_anthropic: bool = False) -> None: @@ -87,115 +77,3 @@ def apply_anthropic_cache_control( _apply_cache_marker(messages[idx], marker, native_anthropic=native_anthropic) return messages - - -def _mark_system_stable_block( - messages: List[Dict[str, Any]], - long_lived_marker: Dict[str, str], -) -> bool: - """Mark the *first* content block of the system message with the 1h marker. - - The system message is expected to have been split into multiple content - blocks beforehand by the caller — block[0] is the cross-session-stable - prefix, subsequent blocks carry context files + volatile suffix. - Falls back to marking the whole system message as a single block when - the message hasn't been split (preserves correctness on the fallback path). - - Returns True when a marker was placed. - """ - if not messages or messages[0].get("role") != "system": - return False - - sys_msg = messages[0] - content = sys_msg.get("content") - - # Already a list of blocks → mark the first block. - if isinstance(content, list) and content: - first = content[0] - if isinstance(first, dict): - first["cache_control"] = long_lived_marker - return True - return False - - # String content (no split) → cannot place a stable-prefix breakpoint - # without changing the byte content. Caller is responsible for - # splitting; if they didn't, fall through to envelope marker so we still - # cache *something* for this turn. - if isinstance(content, str) and content: - sys_msg["content"] = [ - {"type": "text", "text": content, "cache_control": long_lived_marker} - ] - return True - - return False - - -def apply_anthropic_cache_control_long_lived( - api_messages: List[Dict[str, Any]], - long_lived_ttl: str = "1h", - rolling_ttl: str = "5m", - native_anthropic: bool = False, -) -> List[Dict[str, Any]]: - """Apply prefix_and_2 caching: long-lived stable prefix + rolling window. - - Layout (4 breakpoints total): - * Stable system prefix (block[0]) → ``long_lived_ttl`` TTL - * Last 2 non-system messages → ``rolling_ttl`` TTL each - - NOTE: this function does NOT mark the tools array. Tools cache_control - is attached separately (see ``mark_tools_for_long_lived_cache``) because - tools live outside the messages list in the API payload. - - The caller MUST have split the system message into ordered content - blocks where block[0] is the cross-session-stable portion. If the system - message is still a single string, it is wrapped into a single block and - marked — this is correct, just less effective (the volatile suffix is - not isolated, so the prefix invalidates per-session). - - Returns: - Deep copy of messages with cache_control breakpoints injected. - """ - messages = copy.deepcopy(api_messages) - if not messages: - return messages - - long_marker = _build_marker(long_lived_ttl) - rolling_marker = _build_marker(rolling_ttl) - - placed_prefix = _mark_system_stable_block(messages, long_marker) - - # Reserve 1 breakpoint for the system prefix (when placed); spend the - # remaining 3 on the rolling tail. Anthropic max is 4 total — - # tools[-1] (when marked) consumes the 4th, so we cap rolling at 2 here. - rolling_budget = 2 if placed_prefix else 3 - non_sys = [i for i in range(len(messages)) if messages[i].get("role") != "system"] - for idx in non_sys[-rolling_budget:]: - _apply_cache_marker(messages[idx], rolling_marker, native_anthropic=native_anthropic) - - return messages - - -def mark_tools_for_long_lived_cache( - tools: Optional[List[Dict[str, Any]]], - long_lived_ttl: str = "1h", -) -> Optional[List[Dict[str, Any]]]: - """Attach cache_control to the last tool in the OpenAI-format tools list. - - Anthropic prefix-cache order is ``tools → system → messages``. Marking - the last tool dict caches the entire tools array (Anthropic's docs: - "the marker is placed on the last block you want included in the cached - prefix"). Marker is preserved across the OpenAI-wire boundary on - OpenRouter and Nous Portal (which proxies to OpenRouter); on native - Anthropic the marker is forwarded by ``convert_tools_to_anthropic``. - - Returns a deep copy of the tools list with the marker attached, or the - input unchanged when tools is empty/None. Pure function — does not - mutate the input. - """ - if not tools: - return tools - out = copy.deepcopy(tools) - last = out[-1] - if isinstance(last, dict): - last["cache_control"] = _build_marker(long_lived_ttl) - return out diff --git a/hermes_cli/config.py b/hermes_cli/config.py index dc3e414948b..4c2596594ec 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -735,15 +735,8 @@ DEFAULT_CONFIG = { # Anthropic prompt caching (Claude via OpenRouter or native Anthropic API). # cache_ttl must be "5m" or "1h" (Anthropic-supported tiers); other values are ignored. - # long_lived_prefix: when true (default), Claude on Anthropic / OpenRouter / Nous - # Portal uses a split layout: tools[-1] + stable system prefix at long_lived_ttl - # (cross-session cache), last 2 messages at cache_ttl (within-session rolling). - # Set false to keep the legacy "system + last 3 messages" single-tier layout. - # long_lived_ttl: TTL for the cross-session prefix tier ("5m" or "1h"; default "1h"). "prompt_caching": { "cache_ttl": "5m", - "long_lived_prefix": True, - "long_lived_ttl": "1h", }, # OpenRouter-specific settings. diff --git a/run_agent.py b/run_agent.py index 7c6c62cc9a3..1c4c35c96e0 100644 --- a/run_agent.py +++ b/run_agent.py @@ -1454,15 +1454,6 @@ class AIAgent: # 1h tier costs 2x on write vs 1.25x for 5m, but amortizes across long # sessions with >5-minute pauses between turns (#14971). self._cache_ttl = "5m" - # Long-lived prefix caching: when enabled and supported by the - # current provider, splits the system prompt into a stable prefix - # (cached cross-session at 1h TTL) and a volatile suffix - # (memory/timestamp — never cached), and attaches a 1h cache_control - # marker to the last tool in the schema array. Restricted to - # Claude on Anthropic / OpenRouter / Nous Portal; see - # ``_supports_long_lived_anthropic_cache``. - self._use_long_lived_prefix_cache = False - self._long_lived_cache_ttl = "1h" try: from hermes_cli.config import load_config as _load_pc_cfg @@ -1470,12 +1461,6 @@ class AIAgent: _ttl = _pc_cfg.get("cache_ttl", "5m") if _ttl in {"5m", "1h"}: self._cache_ttl = _ttl - _ll_enabled = _pc_cfg.get("long_lived_prefix", True) - _ll_ttl = _pc_cfg.get("long_lived_ttl", "1h") - if _ll_ttl in ("5m", "1h"): - self._long_lived_cache_ttl = _ll_ttl - if _ll_enabled and self._use_prompt_caching and self._supports_long_lived_anthropic_cache(): - self._use_long_lived_prefix_cache = True except Exception: pass @@ -2480,7 +2465,6 @@ class AIAgent: "client_kwargs": dict(self._client_kwargs), "use_prompt_caching": self._use_prompt_caching, "use_native_cache_layout": self._use_native_cache_layout, - "use_long_lived_prefix_cache": self._use_long_lived_prefix_cache, # Context engine state that _try_activate_fallback() overwrites. # Use getattr for model/base_url/api_key/provider since plugin # engines may not have these (they're ContextCompressor-specific). @@ -2716,15 +2700,6 @@ class AIAgent: model=new_model, ) ) - self._use_long_lived_prefix_cache = bool( - self._use_prompt_caching - and self._supports_long_lived_anthropic_cache( - provider=new_provider, - base_url=self.base_url, - api_mode=api_mode, - model=new_model, - ) - ) # ── LM Studio: preload before probing context length ── self._ensure_lmstudio_runtime_loaded() @@ -2773,7 +2748,6 @@ class AIAgent: "client_kwargs": dict(self._client_kwargs), "use_prompt_caching": self._use_prompt_caching, "use_native_cache_layout": self._use_native_cache_layout, - "use_long_lived_prefix_cache": self._use_long_lived_prefix_cache, "compressor_model": getattr(_cc, "model", self.model) if _cc else self.model, "compressor_base_url": getattr(_cc, "base_url", self.base_url) if _cc else self.base_url, "compressor_api_key": getattr(_cc, "api_key", "") if _cc else "", @@ -3584,80 +3558,6 @@ class AIAgent: return False, False - def _supports_long_lived_anthropic_cache( - self, - *, - provider: Optional[str] = None, - base_url: Optional[str] = None, - api_mode: Optional[str] = None, - model: Optional[str] = None, - ) -> bool: - """Decide whether the long-lived (1h cross-session) cache layout applies. - - Narrower than ``_anthropic_prompt_cache_policy`` — only enabled - for Claude models on the four endpoints whose cross-session - cache_control behavior we have explicitly validated: - - * Native Anthropic API (``api_mode == 'anthropic_messages'`` + - host ``api.anthropic.com``) - * Anthropic OAuth subscription (same transport as native API) - * OpenRouter (``base_url`` contains ``openrouter.ai``) - * Nous Portal (``base_url`` contains ``nousresearch`` — proxies - to OpenRouter, so identical wire-format) - - All four honour ``cache_control`` on both the tools array and the - first system content block, and bill cross-session cache reads at - the documented 0.1× rate. - - Other endpoints covered by the standard ``system_and_3`` policy - (third-party Anthropic gateways, MiniMax, opencode-go Qwen, etc.) - keep that layout — they support cache_control but their behavior - with mixed-TTL multi-block system content has not been validated - against this codebase. - """ - eff_provider = (provider if provider is not None else self.provider) or "" - eff_base_url = base_url if base_url is not None else (self.base_url or "") - eff_api_mode = api_mode if api_mode is not None else (self.api_mode or "") - eff_model = (model if model is not None else self.model) or "" - - model_lower = eff_model.lower() - is_claude = "claude" in model_lower - is_nous_portal = "nousresearch" in eff_base_url.lower() - - # Nous Portal Claude rides the 1h prefix_and_2 layout (Portal - # proxies to OpenRouter, which honours ttl=1h on Anthropic - # routes). Qwen does NOT — Alibaba DashScope (the upstream for - # all Qwen routes, including Portal -> OpenRouter -> Alibaba) - # documents a single ``ephemeral`` TTL of 5 minutes; ttl="1h" - # on Qwen markers is silently ignored upstream, so the - # high-value tools[-1] + system-prefix breakpoints never land - # and only the 5m rolling-window markers on the last 2 messages - # get cached. Portal Qwen still gets cache_control via - # _anthropic_prompt_cache_policy returning (True, False) — it - # just rides the standard system_and_3 5m layout instead of the - # mismatched prefix_and_2 1h layout. - if is_nous_portal and is_claude: - return True - - if not is_claude: - return False - - # Native Anthropic + Anthropic OAuth subscription - if eff_api_mode == "anthropic_messages": - if eff_provider == "anthropic" or base_url_hostname(eff_base_url) == "api.anthropic.com": - return True - - # OpenRouter - if base_url_host_matches(eff_base_url, "openrouter.ai"): - return True - - # Nous Portal — front-ends OpenRouter behind the scenes; identical - # wire format and cache_control semantics. - if is_nous_portal: - return True - - return False - @staticmethod def _model_requires_responses_api(model: str) -> bool: """Return True for models that require the Responses API path. @@ -5906,26 +5806,19 @@ class AIAgent: """Assemble the system prompt as three ordered parts. Returns a dict with three keys: - * ``stable`` — content that is byte-stable across sessions for a - given user config: identity, tool guidance, skills prompt, + * ``stable`` — identity, tool guidance, skills prompt, environment hints, platform hints, model-family operational - guidance. Eligible for cross-session 1h prompt caching when - placed as a separate Anthropic content block (see - ``apply_anthropic_cache_control_long_lived``). - * ``context`` — context files (AGENTS.md, .cursorrules, etc.) and - caller-supplied system_message. Stable within a session but may - change between sessions when files are edited or the cwd - differs. Cached within-session via the rolling messages - breakpoint (5m TTL); not promoted to the long-lived tier so - edits don't poison the cross-session cache. - * ``volatile`` — content that changes on most turns/sessions: - memory snapshot, user profile, external memory provider block, - timestamp line. Never marked for caching. + guidance. + * ``context`` — context files (AGENTS.md, .cursorrules, etc.) + and caller-supplied system_message. + * ``volatile`` — memory snapshot, user profile, external + memory provider block, timestamp line. - Joined ``stable\\n\\ncontext\\n\\nvolatile`` produces the same - logical content the old single-string builder produced, with the - guarantee that volatile content is at the end (cache-friendly - ordering for any provider that does prefix caching). + Joined into a single string by ``_build_system_prompt`` and + cached on ``_cached_system_prompt`` for the lifetime of the + AIAgent. Hermes never re-renders parts of this string mid- + session — that's the only way to keep upstream prompt caches + warm across turns. """ # ── Stable tier ──────────────────────────────────────────────── stable_parts: List[str] = [] @@ -6127,9 +6020,10 @@ class AIAgent: Layers are ordered cache-friendly: stable identity/guidance first, then session-stable context files, then per-call volatile content - (memory, USER profile, timestamp). The split is exposed via - ``_build_system_prompt_parts`` for the long-lived prompt-caching - path (Claude on Anthropic / OpenRouter / Nous Portal). + (memory, USER profile, timestamp). The whole string is treated as + one cached block — Hermes never rebuilds or reinjects parts of it + mid-session, which is the only way to keep upstream prompt caches + warm across turns. """ parts = self._build_system_prompt_parts(system_message=system_message) joined = "\n\n".join(p for p in (parts["stable"], parts["context"], parts["volatile"]) if p) @@ -8896,15 +8790,6 @@ class AIAgent: model=fb_model, ) ) - self._use_long_lived_prefix_cache = bool( - self._use_prompt_caching - and self._supports_long_lived_anthropic_cache( - provider=fb_provider, - base_url=fb_base_url, - api_mode=fb_api_mode, - model=fb_model, - ) - ) # LM Studio: preload before probing the fallback's context length. self._ensure_lmstudio_runtime_loaded() @@ -8981,16 +8866,6 @@ class AIAgent: "use_native_cache_layout", self.api_mode == "anthropic_messages" and self.provider == "anthropic", ) - # Long-lived prefix flag was added later — restore False on - # snapshots predating the new field, then re-evaluate against - # the restored provider/model in case the user had it enabled. - self._use_long_lived_prefix_cache = rt.get( - "use_long_lived_prefix_cache", - bool( - self._use_prompt_caching - and self._supports_long_lived_anthropic_cache() - ), - ) # ── Rebuild client for the primary provider ── if self.api_mode == "anthropic_messages": @@ -9568,19 +9443,7 @@ class AIAgent: def _build_api_kwargs(self, api_messages: list) -> dict: """Build the keyword arguments dict for the active API mode.""" - # Resolve the tools array exactly once. When the long-lived - # prefix-cache layout is active (Claude on Anthropic / OpenRouter - # / Nous Portal), attach a 1h cache_control marker to the last - # tool — this caches the entire tools array cross-session via - # Anthropic's tools→system→messages prefix order. The function - # returns a deep copy, so self.tools is never mutated. - if self._use_long_lived_prefix_cache and self.tools: - from agent.prompt_caching import mark_tools_for_long_lived_cache - tools_for_api = mark_tools_for_long_lived_cache( - self.tools, long_lived_ttl=self._long_lived_cache_ttl, - ) - else: - tools_for_api = self.tools + tools_for_api = self.tools if self.api_mode == "anthropic_messages": _transport = self._get_transport() @@ -12440,36 +12303,21 @@ class AIAgent: # External recall context is injected into the user message, not the system # prompt, so the stable cache prefix remains unchanged. # - # When the long-lived prefix-cache layout is active (Claude on - # Anthropic / OpenRouter / Nous Portal), we build the system - # message as a *list of content blocks*: [stable, context, - # volatile, ephemeral?]. Block 0 (stable) gets the 1h - # cache_control marker further down via - # apply_anthropic_cache_control_long_lived; blocks 1-3 are - # cached only via the rolling messages window at 5m. # NOTE: Plugin context from pre_llm_call hooks is injected into the # user message (see injection block above), NOT the system prompt. # This is intentional — system prompt modifications break the prompt # cache prefix. The system prompt is reserved for Hermes internals. - if self._use_long_lived_prefix_cache: - _sys_parts = self._build_system_prompt_parts(system_message=system_message) - _sys_blocks: list = [] - if _sys_parts.get("stable"): - _sys_blocks.append({"type": "text", "text": _sys_parts["stable"]}) - if _sys_parts.get("context"): - _sys_blocks.append({"type": "text", "text": _sys_parts["context"]}) - if _sys_parts.get("volatile"): - _sys_blocks.append({"type": "text", "text": _sys_parts["volatile"]}) - if self.ephemeral_system_prompt: - _sys_blocks.append({"type": "text", "text": self.ephemeral_system_prompt}) - if _sys_blocks: - api_messages = [{"role": "system", "content": _sys_blocks}] + api_messages - else: - effective_system = active_system_prompt or "" - if self.ephemeral_system_prompt: - effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip() - if effective_system: - api_messages = [{"role": "system", "content": effective_system}] + api_messages + # + # Hermes invariant: the system prompt is built ONCE per session + # (cached on ``_cached_system_prompt``) and replayed verbatim on + # every turn. We send it as a single content string so the + # bytes are byte-stable across turns and upstream prompt caches + # stay warm. + effective_system = active_system_prompt or "" + if self.ephemeral_system_prompt: + effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip() + if effective_system: + api_messages = [{"role": "system", "content": effective_system}] + api_messages # Inject ephemeral prefill messages right after the system prompt # but before conversation history. Same API-call-time-only pattern. @@ -12483,29 +12331,13 @@ class AIAgent: # gateways. Auto-detected: if ``_use_prompt_caching`` is set, # inject cache_control breakpoints (system + last 3 messages) # to reduce input token costs by ~75% on multi-turn - # conversations. Layout is chosen per endpoint by - # ``_anthropic_prompt_cache_policy``. - # - # Long-lived prefix layout (prefix_and_2): stable system block - # gets 1h marker + last 2 messages get 5m markers. Tools - # array's last entry is marked separately at API-call kwargs - # build time (see ``_build_api_kwargs`` and - # ``mark_tools_for_long_lived_cache``). + # conversations. if self._use_prompt_caching: - if self._use_long_lived_prefix_cache: - from agent.prompt_caching import apply_anthropic_cache_control_long_lived - api_messages = apply_anthropic_cache_control_long_lived( - api_messages, - long_lived_ttl=self._long_lived_cache_ttl, - rolling_ttl=self._cache_ttl, - native_anthropic=self._use_native_cache_layout, - ) - else: - api_messages = apply_anthropic_cache_control( - api_messages, - cache_ttl=self._cache_ttl, - native_anthropic=self._use_native_cache_layout, - ) + api_messages = apply_anthropic_cache_control( + api_messages, + cache_ttl=self._cache_ttl, + native_anthropic=self._use_native_cache_layout, + ) # Safety net: strip orphaned tool results / add stubs for missing # results before sending to the API. Runs unconditionally — not diff --git a/tests/agent/test_prompt_caching.py b/tests/agent/test_prompt_caching.py index 9d989571b54..f6f3e9f0a38 100644 --- a/tests/agent/test_prompt_caching.py +++ b/tests/agent/test_prompt_caching.py @@ -6,8 +6,6 @@ import pytest from agent.prompt_caching import ( _apply_cache_marker, apply_anthropic_cache_control, - apply_anthropic_cache_control_long_lived, - mark_tools_for_long_lived_cache, ) @@ -143,132 +141,3 @@ class TestApplyAnthropicCacheControl: elif "cache_control" in msg: count += 1 assert count <= 4 - - -class TestMarkToolsForLongLivedCache: - def test_returns_unchanged_for_empty_tools(self): - assert mark_tools_for_long_lived_cache(None) is None - assert mark_tools_for_long_lived_cache([]) == [] - - def test_marks_only_last_tool(self): - tools = [ - {"type": "function", "function": {"name": "a"}}, - {"type": "function", "function": {"name": "b"}}, - {"type": "function", "function": {"name": "c"}}, - ] - out = mark_tools_for_long_lived_cache(tools) - assert "cache_control" not in out[0] - assert "cache_control" not in out[1] - assert out[2]["cache_control"] == {"type": "ephemeral", "ttl": "1h"} - - def test_does_not_mutate_input(self): - tools = [{"type": "function", "function": {"name": "a"}}] - mark_tools_for_long_lived_cache(tools) - assert "cache_control" not in tools[0] - - def test_5m_ttl_drops_ttl_field(self): - tools = [{"type": "function", "function": {"name": "a"}}] - out = mark_tools_for_long_lived_cache(tools, long_lived_ttl="5m") - assert out[0]["cache_control"] == {"type": "ephemeral"} - - -class TestApplyAnthropicCacheControlLongLived: - def test_empty_messages(self): - assert apply_anthropic_cache_control_long_lived([]) == [] - - def test_marks_first_block_of_split_system(self): - msgs = [ - {"role": "system", "content": [ - {"type": "text", "text": "STABLE"}, - {"type": "text", "text": "CONTEXT"}, - {"type": "text", "text": "VOLATILE"}, - ]}, - {"role": "user", "content": "msg1"}, - {"role": "assistant", "content": "msg2"}, - ] - out = apply_anthropic_cache_control_long_lived(msgs) - sys_blocks = out[0]["content"] - assert sys_blocks[0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"} - assert "cache_control" not in sys_blocks[1] - assert "cache_control" not in sys_blocks[2] - - def test_rolling_marker_on_last_2_messages(self): - msgs = [ - {"role": "system", "content": [{"type": "text", "text": "S"}]}, - {"role": "user", "content": "u1"}, - {"role": "assistant", "content": "a1"}, - {"role": "user", "content": "u2"}, - {"role": "assistant", "content": "a2"}, - ] - out = apply_anthropic_cache_control_long_lived(msgs) - - def has_marker(m): - c = m.get("content") - if isinstance(c, list) and c and isinstance(c[-1], dict): - return "cache_control" in c[-1] - return "cache_control" in m - - # u1 and a1 (older messages) should NOT be marked - assert not has_marker(out[1]) - assert not has_marker(out[2]) - # u2 and a2 (last 2) SHOULD be marked - assert has_marker(out[3]) - assert has_marker(out[4]) - - def test_rolling_marker_uses_5m_ttl(self): - msgs = [ - {"role": "system", "content": [{"type": "text", "text": "S"}]}, - {"role": "user", "content": "u1"}, - {"role": "assistant", "content": "a1"}, - ] - out = apply_anthropic_cache_control_long_lived( - msgs, long_lived_ttl="1h", rolling_ttl="5m", - ) - # Last user message: cache_control on the wrapped text part should be 5m - last = out[-1] - c = last["content"] - assert isinstance(c, list) - assert c[-1]["cache_control"] == {"type": "ephemeral"} # 5m has no ttl key - - def test_string_system_falls_back_to_envelope_marker(self): - """When the caller didn't split the system message, we still place a marker.""" - msgs = [ - {"role": "system", "content": "Single string system"}, - {"role": "user", "content": "u1"}, - ] - out = apply_anthropic_cache_control_long_lived(msgs) - sys_content = out[0]["content"] - # Wrapped into a list and the (now sole) block gets the 1h marker - assert isinstance(sys_content, list) - assert sys_content[0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"} - - def test_does_not_mutate_input(self): - msgs = [ - {"role": "system", "content": [{"type": "text", "text": "S"}]}, - {"role": "user", "content": "u1"}, - ] - before = copy.deepcopy(msgs) - apply_anthropic_cache_control_long_lived(msgs) - assert msgs == before - - def test_max_4_breakpoints_with_split_system(self): - msgs = [ - {"role": "system", "content": [{"type": "text", "text": "S"}, {"type": "text", "text": "V"}]}, - ] + [ - {"role": "user" if i % 2 == 0 else "assistant", "content": f"msg{i}"} - for i in range(10) - ] - out = apply_anthropic_cache_control_long_lived(msgs) - count = 0 - for m in out: - c = m.get("content") - if isinstance(c, list): - for item in c: - if isinstance(item, dict) and "cache_control" in item: - count += 1 - elif "cache_control" in m: - count += 1 - # 1 system block + last 2 messages = 3 breakpoints from this function. - # tools[-1] is marked separately (not via this function), so a 4th - # breakpoint can be added at API-call time. - assert count == 3 diff --git a/tests/agent/test_prompt_caching_live.py b/tests/agent/test_prompt_caching_live.py deleted file mode 100644 index f72b6b9d906..00000000000 --- a/tests/agent/test_prompt_caching_live.py +++ /dev/null @@ -1,112 +0,0 @@ -"""Live E2E: long-lived prefix caching on Claude via OpenRouter. - -Run only when LIVE_OR_KEY env var is set. Skipped under the normal hermetic -test suite (which unsets credentials). -""" -import os, sys, tempfile, time, shutil, pytest - - -# Probe for the key BEFORE conftest unsets it -_LIVE_KEY = os.environ.get("OPENROUTER_API_KEY") or os.environ.get("LIVE_OR_KEY") -if not _LIVE_KEY: - # Try to read directly from .env - env_path = os.path.expanduser("~/.hermes/.env") - if os.path.exists(env_path): - with open(env_path) as f: - for line in f: - if line.startswith("OPENROUTER_API_KEY="): - _LIVE_KEY = line.strip().split("=", 1)[1].strip().strip('"').strip("'") - break - - -pytestmark = pytest.mark.skipif( - not _LIVE_KEY, - reason="set OPENROUTER_API_KEY (or LIVE_OR_KEY) to run live cache test", -) - - -def test_long_lived_prefix_cache_e2e_openrouter(tmp_path, monkeypatch): - """Two AIAgent runs in fresh sessions: call 1 writes cache, call 2 reads it.""" - monkeypatch.setenv("HERMES_HOME", str(tmp_path)) - # The hermetic conftest unsets OPENROUTER_API_KEY — restore for this test - monkeypatch.setenv("OPENROUTER_API_KEY", _LIVE_KEY) - - # Minimal config — but with enough toolset/guidance to exceed Anthropic's - # ~1024-token minimum-cacheable-prefix threshold. Anthropic silently - # ignores cache_control markers on small blocks. - import yaml - cfg_path = tmp_path / "config.yaml" - cfg_path.write_text(yaml.safe_dump({ - "model": {"provider": "openrouter", "default": "anthropic/claude-haiku-4.5"}, - "prompt_caching": {"long_lived_prefix": True, "long_lived_ttl": "1h", "cache_ttl": "5m"}, - "agent": {"tool_use_enforcement": True}, # adds substantial guidance text - "memory": {"provider": ""}, - "compression": {"enabled": False}, - })) - - from run_agent import AIAgent - - def make_agent(): - return AIAgent( - api_key=_LIVE_KEY, - base_url="https://openrouter.ai/api/v1", - provider="openrouter", - model="anthropic/claude-haiku-4.5", - api_mode="chat_completions", - # Use the default toolset roster — the tools array (~13k tokens - # for ~35 tools) is what carries the bulk of the cross-session - # cache value. With a tiny toolset the cached prefix can fall - # below Anthropic Haiku's 2048-token minimum cacheable size and - # the marker is silently ignored. - enabled_toolsets=None, - quiet_mode=True, - skip_context_files=True, - skip_memory=True, - save_trajectories=False, - ) - - a1 = make_agent() - assert a1._use_prompt_caching is True, "policy should enable caching for Claude on OR" - assert a1._use_long_lived_prefix_cache is True, "long-lived path should activate" - parts = a1._build_system_prompt_parts() - print(f"\nstable={len(parts['stable']):,} ctx={len(parts['context']):,} volatile={len(parts['volatile']):,} chars") - print(f"tool count: {len(a1.tools or [])}") - - # Use distinct user messages each call so OpenRouter's response cache - # doesn't short-circuit the upstream Anthropic call (we need real - # Anthropic billing visibility to verify cache_creation/cache_read). - USER_1 = "Reply with the single word ALPHA." - USER_2 = "Reply with the single word BRAVO." - - print("\n--- Call 1 (cold) ---") - r1 = a1.run_conversation(USER_1, conversation_history=[]) - print(f"final_response[:80]: {(r1.get('final_response') or '')[:80]!r}") - cr1 = a1.session_cache_read_tokens - cw1 = a1.session_cache_write_tokens - print(f"call1: cache_read={cr1} cache_write={cw1}") - - # Wait so cache settles, then fresh agent (NEW SESSION) for cross-session read - time.sleep(2) - a2 = make_agent() - assert a2.session_id != a1.session_id, "second agent must have a new session" - - print("\n--- Call 2 (warm, NEW session, different user msg) ---") - r2 = a2.run_conversation(USER_2, conversation_history=[]) - print(f"final_response[:80]: {(r2.get('final_response') or '')[:80]!r}") - cr2 = a2.session_cache_read_tokens - cw2 = a2.session_cache_write_tokens - print(f"call2: cache_read={cr2} cache_write={cw2}") - - print(f"\n=== VERDICT ===") - print(f" call1 wrote {cw1:,} cache tokens, read {cr1:,}") - print(f" call2 wrote {cw2:,} cache tokens, read {cr2:,}") - if cw1: - print(f" cross-session read fraction: cr2/cw1 = {cr2/cw1:.2%}") - - # Assertions - assert cw1 > 0, f"call 1 must write cache (got {cw1}); long-lived layout not reaching wire" - assert cr2 > 0, ( - f"call 2 must read cache cross-session (got {cr2}); " - f"stable prefix is not byte-stable across sessions" - ) - assert cr2 >= 1000, f"cache_read on call 2 ({cr2}) too small to indicate real reuse" diff --git a/tests/run_agent/test_anthropic_prompt_cache_policy.py b/tests/run_agent/test_anthropic_prompt_cache_policy.py index 3d7358e6704..ba6e54f0372 100644 --- a/tests/run_agent/test_anthropic_prompt_cache_policy.py +++ b/tests/run_agent/test_anthropic_prompt_cache_policy.py @@ -330,134 +330,3 @@ class TestExplicitOverrides: # Long-lived prefix cache policy (cross-session 1h tier) # ───────────────────────────────────────────────────────────────────── -class TestSupportsLongLivedAnthropicCache: - """Narrower than _anthropic_prompt_cache_policy — only Claude on the 4 - explicitly-validated endpoints get the long-lived layout.""" - - def test_native_anthropic_claude_supported(self): - agent = _make_agent( - provider="anthropic", - base_url="https://api.anthropic.com", - api_mode="anthropic_messages", - model="claude-sonnet-4.6", - ) - assert agent._supports_long_lived_anthropic_cache() is True - - def test_anthropic_oauth_supported(self): - # OAuth uses the same transport as native Anthropic - agent = _make_agent( - provider="anthropic", - base_url="https://api.anthropic.com", - api_mode="anthropic_messages", - model="claude-opus-4.6", - ) - assert agent._supports_long_lived_anthropic_cache() is True - - def test_openrouter_claude_supported(self): - agent = _make_agent( - provider="openrouter", - base_url="https://openrouter.ai/api/v1", - api_mode="chat_completions", - model="anthropic/claude-sonnet-4.6", - ) - assert agent._supports_long_lived_anthropic_cache() is True - - def test_nous_portal_claude_supported(self): - # Nous Portal proxies to OpenRouter — same wire format - agent = _make_agent( - provider="nous", - base_url="https://inference-api.nousresearch.com/v1", - api_mode="chat_completions", - model="anthropic/claude-opus-4.7", - ) - assert agent._supports_long_lived_anthropic_cache() is True - - def test_nous_portal_qwen_NOT_long_lived(self): - # Portal Qwen still gets cache_control markers via the standard - # system_and_3 5m layout (see _anthropic_prompt_cache_policy - # tests above), but it must NOT ride the prefix_and_2 1h layout. - # Alibaba DashScope (the upstream for every Qwen route, incl. - # Portal -> OpenRouter -> Alibaba) only supports a single - # ``ephemeral`` TTL of 5 minutes; ttl="1h" markers are silently - # ignored, so the high-value tools[-1] + system-prefix - # breakpoints don't land. Stay on system_and_3 instead. - agent = _make_agent( - provider="nous", - base_url="https://inference-api.nousresearch.com/v1", - api_mode="chat_completions", - model="qwen3.6-plus", - ) - assert agent._supports_long_lived_anthropic_cache() is False - - def test_nous_portal_qwen_vendored_slug_NOT_long_lived(self): - agent = _make_agent( - provider="nous", - base_url="https://inference-api.nousresearch.com/v1", - api_mode="chat_completions", - model="qwen/qwen3.6-plus", - ) - assert agent._supports_long_lived_anthropic_cache() is False - - def test_nous_portal_non_claude_rejected(self): - # Portal long-lived cache scope is now Claude-only. Qwen - # rejection is covered by the dedicated tests above; this - # covers everything else (gpt, etc.). - agent = _make_agent( - provider="nous", - base_url="https://inference-api.nousresearch.com/v1", - api_mode="chat_completions", - model="openai/gpt-5.4", - ) - assert agent._supports_long_lived_anthropic_cache() is False - - def test_openrouter_non_claude_rejected(self): - agent = _make_agent( - provider="openrouter", - base_url="https://openrouter.ai/api/v1", - api_mode="chat_completions", - model="openai/gpt-5.4", - ) - assert agent._supports_long_lived_anthropic_cache() is False - - def test_third_party_anthropic_gateway_rejected(self): - # MiniMax / Kimi / etc. — anthropic-wire but not in our validated list - agent = _make_agent( - provider="minimax", - base_url="https://api.minimax.io/anthropic", - api_mode="anthropic_messages", - model="minimax-m2.7", - ) - assert agent._supports_long_lived_anthropic_cache() is False - - def test_alibaba_dashscope_rejected(self): - agent = _make_agent( - provider="alibaba", - base_url="https://dashscope.aliyuncs.com/api/v1/anthropic", - api_mode="anthropic_messages", - model="qwen3.5-plus", - ) - assert agent._supports_long_lived_anthropic_cache() is False - - def test_opencode_qwen_rejected(self): - agent = _make_agent( - provider="opencode-go", - base_url="https://api.opencode-go.example/v1", - api_mode="chat_completions", - model="qwen3.6-plus", - ) - assert agent._supports_long_lived_anthropic_cache() is False - - def test_fallback_target_evaluated_independently(self): - # Starting on a non-supported provider, falling back to OpenRouter Claude - agent = _make_agent( - provider="minimax", - base_url="https://api.minimax.io/anthropic", - api_mode="anthropic_messages", - model="minimax-m2.7", - ) - assert agent._supports_long_lived_anthropic_cache( - provider="openrouter", - base_url="https://openrouter.ai/api/v1", - api_mode="chat_completions", - model="anthropic/claude-sonnet-4.6", - ) is True diff --git a/tests/test_ctx_halving_fix.py b/tests/test_ctx_halving_fix.py index afeee84878c..0dd3ca4e7eb 100644 --- a/tests/test_ctx_halving_fix.py +++ b/tests/test_ctx_halving_fix.py @@ -169,7 +169,6 @@ class TestEphemeralMaxOutputTokens: agent.reasoning_config = None agent._is_anthropic_oauth = False agent._ephemeral_max_output_tokens = None - agent._use_long_lived_prefix_cache = False compressor = MagicMock() compressor.context_length = 200_000