fix(cache): kill long-lived prefix layout — system prompt is now byte-static within a session (#24778)

The long-lived prefix-cache layout split the system prompt into stable/ context/volatile blocks and re-derived them on every API call. The volatile tier (timestamp + memory snapshot + USER profile) ticks per turn, so the system message bytes mutated mid-conversation and broke upstream prompt caches (OpenRouter, Nous Portal, Anthropic). Diagnosed via live wire-format diffing: an 8-turn conversation showed OLD layout flipping system block[1] sha mid-session at the minute boundary, dropping cached_tokens to 0 on that turn (cumulative 66.6% vs 83.3% for the single-block layout). Hermes invariant: history (system + all but the last 1-2 messages) must be static. Fix: drop the long-lived layout entirely. Single layout everywhere — system_and_3 with one cached system string built once on first turn, replayed verbatim on every subsequent turn. Loses cross-session 1h prefix caching for Claude (the feature that motivated the split), but within-session caching now actually works on every provider. Removed: - run_agent.py: _use_long_lived_prefix_cache flag, _long_lived_cache_ttl, _supports_long_lived_anthropic_cache method, the long-lived branch in run_conversation, mark_tools_for_long_lived_cache call site - agent/prompt_caching.py: apply_anthropic_cache_control_long_lived, mark_tools_for_long_lived_cache, _mark_system_stable_block helper - hermes_cli/config.py: prompt_caching.long_lived_prefix and prompt_caching.long_lived_ttl config keys - tests/agent/test_prompt_caching_live.py (entire file) - tests/agent/test_prompt_caching.py: TestMarkToolsForLongLivedCache, TestApplyAnthropicCacheControlLongLived - tests/run_agent/test_anthropic_prompt_cache_policy.py: TestSupportsLongLivedAnthropicCache Targeted tests: 62/62 pass.
2026-07-18 14:52:04 +00:00 · 2026-05-12 20:46:04 -07:00 · 2026-05-12 20:46:04 -07:00 · b06e999302
commit b06e999302
parent 80374d4dd9
8 changed files with 41 additions and 714 deletions
--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@ -1305,9 +1305,8 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
            ),
        }
        # Forward cache_control marker when present on the OpenAI-format
-        # tool dict (set by ``mark_tools_for_long_lived_cache``). Anthropic's
-        # tools array supports cache_control on the last tool to cache the
-        # entire schema cross-session.
+        # tool dict. Anthropic's tools array supports cache_control on the
+        # last tool to cache the entire schema cross-session.
        cache_control = t.get("cache_control")
        if isinstance(cache_control, dict):
            anthropic_tool["cache_control"] = dict(cache_control)
--- a/agent/prompt_caching.py
+++ b/agent/prompt_caching.py
@ -1,25 +1,15 @@
-"""Anthropic prompt caching strategies.
+"""Anthropic prompt caching strategy.

-Two layouts:
-
-* ``system_and_3`` (default, used everywhere except the long-lived path):
-  4 cache_control breakpoints — system prompt + last 3 non-system messages.
-  All at the same TTL (5m or 1h). Reduces input token costs by ~75% on
-  multi-turn conversations within a single session.
-
-* ``prefix_and_2`` (Claude on Anthropic / OpenRouter / Nous Portal):
-  4 breakpoints split across two TTL tiers — tools[-1] (1h) +
-  stable system prefix (1h) + last 2 non-system messages (5m). The
-  long-lived prefix is byte-stable across sessions for a given user
-  config, so every fresh session reads the cached system+tools instead
-  of re-paying for them. Within-session rolling window shrinks from 3
-  messages to 2 to free the breakpoint budget.
+Single layout: ``system_and_3``. 4 cache_control breakpoints — system
+prompt + last 3 non-system messages, all at the same TTL (5m or 1h).
+Reduces input token costs by ~75% on multi-turn conversations within a
+single session.

 Pure functions -- no class state, no AIAgent dependency.
 """

 import copy
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List


 def _apply_cache_marker(msg: dict, cache_marker: dict, native_anthropic: bool = False) -> None:
@ -87,115 +77,3 @@ def apply_anthropic_cache_control(
        _apply_cache_marker(messages[idx], marker, native_anthropic=native_anthropic)

    return messages
-
-
-def _mark_system_stable_block(
-    messages: List[Dict[str, Any]],
-    long_lived_marker: Dict[str, str],
-) -> bool:
-    """Mark the *first* content block of the system message with the 1h marker.
-
-    The system message is expected to have been split into multiple content
-    blocks beforehand by the caller — block[0] is the cross-session-stable
-    prefix, subsequent blocks carry context files + volatile suffix.
-    Falls back to marking the whole system message as a single block when
-    the message hasn't been split (preserves correctness on the fallback path).
-
-    Returns True when a marker was placed.
-    """
-    if not messages or messages[0].get("role") != "system":
-        return False
-
-    sys_msg = messages[0]
-    content = sys_msg.get("content")
-
-    # Already a list of blocks → mark the first block.
-    if isinstance(content, list) and content:
-        first = content[0]
-        if isinstance(first, dict):
-            first["cache_control"] = long_lived_marker
-            return True
-        return False
-
-    # String content (no split) → cannot place a stable-prefix breakpoint
-    # without changing the byte content.  Caller is responsible for
-    # splitting; if they didn't, fall through to envelope marker so we still
-    # cache *something* for this turn.
-    if isinstance(content, str) and content:
-        sys_msg["content"] = [
-            {"type": "text", "text": content, "cache_control": long_lived_marker}
-        ]
-        return True
-
-    return False
-
-
-def apply_anthropic_cache_control_long_lived(
-    api_messages: List[Dict[str, Any]],
-    long_lived_ttl: str = "1h",
-    rolling_ttl: str = "5m",
-    native_anthropic: bool = False,
-) -> List[Dict[str, Any]]:
-    """Apply prefix_and_2 caching: long-lived stable prefix + rolling window.
-
-    Layout (4 breakpoints total):
-      * Stable system prefix (block[0]) → ``long_lived_ttl`` TTL
-      * Last 2 non-system messages → ``rolling_ttl`` TTL each
-
-    NOTE: this function does NOT mark the tools array. Tools cache_control
-    is attached separately (see ``mark_tools_for_long_lived_cache``) because
-    tools live outside the messages list in the API payload.
-
-    The caller MUST have split the system message into ordered content
-    blocks where block[0] is the cross-session-stable portion. If the system
-    message is still a single string, it is wrapped into a single block and
-    marked — this is correct, just less effective (the volatile suffix is
-    not isolated, so the prefix invalidates per-session).
-
-    Returns:
-        Deep copy of messages with cache_control breakpoints injected.
-    """
-    messages = copy.deepcopy(api_messages)
-    if not messages:
-        return messages
-
-    long_marker = _build_marker(long_lived_ttl)
-    rolling_marker = _build_marker(rolling_ttl)
-
-    placed_prefix = _mark_system_stable_block(messages, long_marker)
-
-    # Reserve 1 breakpoint for the system prefix (when placed); spend the
-    # remaining 3 on the rolling tail.  Anthropic max is 4 total —
-    # tools[-1] (when marked) consumes the 4th, so we cap rolling at 2 here.
-    rolling_budget = 2 if placed_prefix else 3
-    non_sys = [i for i in range(len(messages)) if messages[i].get("role") != "system"]
-    for idx in non_sys[-rolling_budget:]:
-        _apply_cache_marker(messages[idx], rolling_marker, native_anthropic=native_anthropic)
-
-    return messages
-
-
-def mark_tools_for_long_lived_cache(
-    tools: Optional[List[Dict[str, Any]]],
-    long_lived_ttl: str = "1h",
-) -> Optional[List[Dict[str, Any]]]:
-    """Attach cache_control to the last tool in the OpenAI-format tools list.
-
-    Anthropic prefix-cache order is ``tools → system → messages``.  Marking
-    the last tool dict caches the entire tools array (Anthropic's docs:
-    "the marker is placed on the last block you want included in the cached
-    prefix").  Marker is preserved across the OpenAI-wire boundary on
-    OpenRouter and Nous Portal (which proxies to OpenRouter); on native
-    Anthropic the marker is forwarded by ``convert_tools_to_anthropic``.
-
-    Returns a deep copy of the tools list with the marker attached, or the
-    input unchanged when tools is empty/None.  Pure function — does not
-    mutate the input.
-    """
-    if not tools:
-        return tools
-    out = copy.deepcopy(tools)
-    last = out[-1]
-    if isinstance(last, dict):
-        last["cache_control"] = _build_marker(long_lived_ttl)
-    return out
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -735,15 +735,8 @@ DEFAULT_CONFIG = {

    # Anthropic prompt caching (Claude via OpenRouter or native Anthropic API).
    # cache_ttl must be "5m" or "1h" (Anthropic-supported tiers); other values are ignored.
-    # long_lived_prefix: when true (default), Claude on Anthropic / OpenRouter / Nous
-    #   Portal uses a split layout: tools[-1] + stable system prefix at long_lived_ttl
-    #   (cross-session cache), last 2 messages at cache_ttl (within-session rolling).
-    #   Set false to keep the legacy "system + last 3 messages" single-tier layout.
-    # long_lived_ttl: TTL for the cross-session prefix tier ("5m" or "1h"; default "1h").
    "prompt_caching": {
        "cache_ttl": "5m",
-        "long_lived_prefix": True,
-        "long_lived_ttl": "1h",
    },

    # OpenRouter-specific settings.
--- a/run_agent.py
+++ b/run_agent.py
@ -1454,15 +1454,6 @@ class AIAgent:
        # 1h tier costs 2x on write vs 1.25x for 5m, but amortizes across long
        # sessions with >5-minute pauses between turns (#14971).
        self._cache_ttl = "5m"
-        # Long-lived prefix caching: when enabled and supported by the
-        # current provider, splits the system prompt into a stable prefix
-        # (cached cross-session at 1h TTL) and a volatile suffix
-        # (memory/timestamp — never cached), and attaches a 1h cache_control
-        # marker to the last tool in the schema array.  Restricted to
-        # Claude on Anthropic / OpenRouter / Nous Portal; see
-        # ``_supports_long_lived_anthropic_cache``.
-        self._use_long_lived_prefix_cache = False
-        self._long_lived_cache_ttl = "1h"
        try:
            from hermes_cli.config import load_config as _load_pc_cfg

@ -1470,12 +1461,6 @@ class AIAgent:
            _ttl = _pc_cfg.get("cache_ttl", "5m")
            if _ttl in {"5m", "1h"}:
                self._cache_ttl = _ttl
-            _ll_enabled = _pc_cfg.get("long_lived_prefix", True)
-            _ll_ttl = _pc_cfg.get("long_lived_ttl", "1h")
-            if _ll_ttl in ("5m", "1h"):
-                self._long_lived_cache_ttl = _ll_ttl
-            if _ll_enabled and self._use_prompt_caching and self._supports_long_lived_anthropic_cache():
-                self._use_long_lived_prefix_cache = True
        except Exception:
            pass

@ -2480,7 +2465,6 @@ class AIAgent:
            "client_kwargs": dict(self._client_kwargs),
            "use_prompt_caching": self._use_prompt_caching,
            "use_native_cache_layout": self._use_native_cache_layout,
-            "use_long_lived_prefix_cache": self._use_long_lived_prefix_cache,
            # Context engine state that _try_activate_fallback() overwrites.
            # Use getattr for model/base_url/api_key/provider since plugin
            # engines may not have these (they're ContextCompressor-specific).
@ -2716,15 +2700,6 @@ class AIAgent:
                model=new_model,
            )
        )
-        self._use_long_lived_prefix_cache = bool(
-            self._use_prompt_caching
-            and self._supports_long_lived_anthropic_cache(
-                provider=new_provider,
-                base_url=self.base_url,
-                api_mode=api_mode,
-                model=new_model,
-            )
-        )

        # ── LM Studio: preload before probing context length ──
        self._ensure_lmstudio_runtime_loaded()
@ -2773,7 +2748,6 @@ class AIAgent:
            "client_kwargs": dict(self._client_kwargs),
            "use_prompt_caching": self._use_prompt_caching,
            "use_native_cache_layout": self._use_native_cache_layout,
-            "use_long_lived_prefix_cache": self._use_long_lived_prefix_cache,
            "compressor_model": getattr(_cc, "model", self.model) if _cc else self.model,
            "compressor_base_url": getattr(_cc, "base_url", self.base_url) if _cc else self.base_url,
            "compressor_api_key": getattr(_cc, "api_key", "") if _cc else "",
@ -3584,80 +3558,6 @@ class AIAgent:

        return False, False

-    def _supports_long_lived_anthropic_cache(
-        self,
-        *,
-        provider: Optional[str] = None,
-        base_url: Optional[str] = None,
-        api_mode: Optional[str] = None,
-        model: Optional[str] = None,
-    ) -> bool:
-        """Decide whether the long-lived (1h cross-session) cache layout applies.
-
-        Narrower than ``_anthropic_prompt_cache_policy`` — only enabled
-        for Claude models on the four endpoints whose cross-session
-        cache_control behavior we have explicitly validated:
-
-          * Native Anthropic API (``api_mode == 'anthropic_messages'`` +
-            host ``api.anthropic.com``)
-          * Anthropic OAuth subscription (same transport as native API)
-          * OpenRouter (``base_url`` contains ``openrouter.ai``)
-          * Nous Portal (``base_url`` contains ``nousresearch`` — proxies
-            to OpenRouter, so identical wire-format)
-
-        All four honour ``cache_control`` on both the tools array and the
-        first system content block, and bill cross-session cache reads at
-        the documented 0.1× rate.
-
-        Other endpoints covered by the standard ``system_and_3`` policy
-        (third-party Anthropic gateways, MiniMax, opencode-go Qwen, etc.)
-        keep that layout — they support cache_control but their behavior
-        with mixed-TTL multi-block system content has not been validated
-        against this codebase.
-        """
-        eff_provider = (provider if provider is not None else self.provider) or ""
-        eff_base_url = base_url if base_url is not None else (self.base_url or "")
-        eff_api_mode = api_mode if api_mode is not None else (self.api_mode or "")
-        eff_model = (model if model is not None else self.model) or ""
-
-        model_lower = eff_model.lower()
-        is_claude = "claude" in model_lower
-        is_nous_portal = "nousresearch" in eff_base_url.lower()
-
-        # Nous Portal Claude rides the 1h prefix_and_2 layout (Portal
-        # proxies to OpenRouter, which honours ttl=1h on Anthropic
-        # routes).  Qwen does NOT — Alibaba DashScope (the upstream for
-        # all Qwen routes, including Portal -> OpenRouter -> Alibaba)
-        # documents a single ``ephemeral`` TTL of 5 minutes; ttl="1h"
-        # on Qwen markers is silently ignored upstream, so the
-        # high-value tools[-1] + system-prefix breakpoints never land
-        # and only the 5m rolling-window markers on the last 2 messages
-        # get cached.  Portal Qwen still gets cache_control via
-        # _anthropic_prompt_cache_policy returning (True, False) — it
-        # just rides the standard system_and_3 5m layout instead of the
-        # mismatched prefix_and_2 1h layout.
-        if is_nous_portal and is_claude:
-            return True
-
-        if not is_claude:
-            return False
-
-        # Native Anthropic + Anthropic OAuth subscription
-        if eff_api_mode == "anthropic_messages":
-            if eff_provider == "anthropic" or base_url_hostname(eff_base_url) == "api.anthropic.com":
-                return True
-
-        # OpenRouter
-        if base_url_host_matches(eff_base_url, "openrouter.ai"):
-            return True
-
-        # Nous Portal — front-ends OpenRouter behind the scenes; identical
-        # wire format and cache_control semantics.
-        if is_nous_portal:
-            return True
-
-        return False
-
    @staticmethod
    def _model_requires_responses_api(model: str) -> bool:
        """Return True for models that require the Responses API path.
@ -5906,26 +5806,19 @@ class AIAgent:
        """Assemble the system prompt as three ordered parts.

        Returns a dict with three keys:
-          * ``stable``  — content that is byte-stable across sessions for a
-            given user config: identity, tool guidance, skills prompt,
+          * ``stable``   — identity, tool guidance, skills prompt,
            environment hints, platform hints, model-family operational
-            guidance.  Eligible for cross-session 1h prompt caching when
-            placed as a separate Anthropic content block (see
-            ``apply_anthropic_cache_control_long_lived``).
-          * ``context`` — context files (AGENTS.md, .cursorrules, etc.) and
-            caller-supplied system_message.  Stable within a session but may
-            change between sessions when files are edited or the cwd
-            differs.  Cached within-session via the rolling messages
-            breakpoint (5m TTL); not promoted to the long-lived tier so
-            edits don't poison the cross-session cache.
-          * ``volatile`` — content that changes on most turns/sessions:
-            memory snapshot, user profile, external memory provider block,
-            timestamp line.  Never marked for caching.
+            guidance.
+          * ``context``  — context files (AGENTS.md, .cursorrules, etc.)
+            and caller-supplied system_message.
+          * ``volatile`` — memory snapshot, user profile, external
+            memory provider block, timestamp line.

-        Joined ``stable\\n\\ncontext\\n\\nvolatile`` produces the same
-        logical content the old single-string builder produced, with the
-        guarantee that volatile content is at the end (cache-friendly
-        ordering for any provider that does prefix caching).
+        Joined into a single string by ``_build_system_prompt`` and
+        cached on ``_cached_system_prompt`` for the lifetime of the
+        AIAgent.  Hermes never re-renders parts of this string mid-
+        session — that's the only way to keep upstream prompt caches
+        warm across turns.
        """
        # ── Stable tier ────────────────────────────────────────────────
        stable_parts: List[str] = []
@ -6127,9 +6020,10 @@ class AIAgent:

        Layers are ordered cache-friendly: stable identity/guidance first,
        then session-stable context files, then per-call volatile content
-        (memory, USER profile, timestamp). The split is exposed via
-        ``_build_system_prompt_parts`` for the long-lived prompt-caching
-        path (Claude on Anthropic / OpenRouter / Nous Portal).
+        (memory, USER profile, timestamp).  The whole string is treated as
+        one cached block — Hermes never rebuilds or reinjects parts of it
+        mid-session, which is the only way to keep upstream prompt caches
+        warm across turns.
        """
        parts = self._build_system_prompt_parts(system_message=system_message)
        joined = "\n\n".join(p for p in (parts["stable"], parts["context"], parts["volatile"]) if p)
@ -8896,15 +8790,6 @@ class AIAgent:
                    model=fb_model,
                )
            )
-            self._use_long_lived_prefix_cache = bool(
-                self._use_prompt_caching
-                and self._supports_long_lived_anthropic_cache(
-                    provider=fb_provider,
-                    base_url=fb_base_url,
-                    api_mode=fb_api_mode,
-                    model=fb_model,
-                )
-            )

            # LM Studio: preload before probing the fallback's context length.
            self._ensure_lmstudio_runtime_loaded()
@ -8981,16 +8866,6 @@ class AIAgent:
                "use_native_cache_layout",
                self.api_mode == "anthropic_messages" and self.provider == "anthropic",
            )
-            # Long-lived prefix flag was added later — restore False on
-            # snapshots predating the new field, then re-evaluate against
-            # the restored provider/model in case the user had it enabled.
-            self._use_long_lived_prefix_cache = rt.get(
-                "use_long_lived_prefix_cache",
-                bool(
-                    self._use_prompt_caching
-                    and self._supports_long_lived_anthropic_cache()
-                ),
-            )

            # ── Rebuild client for the primary provider ──
            if self.api_mode == "anthropic_messages":
@ -9568,19 +9443,7 @@ class AIAgent:

    def _build_api_kwargs(self, api_messages: list) -> dict:
        """Build the keyword arguments dict for the active API mode."""
-        # Resolve the tools array exactly once. When the long-lived
-        # prefix-cache layout is active (Claude on Anthropic / OpenRouter
-        # / Nous Portal), attach a 1h cache_control marker to the last
-        # tool — this caches the entire tools array cross-session via
-        # Anthropic's tools→system→messages prefix order. The function
-        # returns a deep copy, so self.tools is never mutated.
-        if self._use_long_lived_prefix_cache and self.tools:
-            from agent.prompt_caching import mark_tools_for_long_lived_cache
-            tools_for_api = mark_tools_for_long_lived_cache(
-                self.tools, long_lived_ttl=self._long_lived_cache_ttl,
-            )
-        else:
-            tools_for_api = self.tools
+        tools_for_api = self.tools

        if self.api_mode == "anthropic_messages":
            _transport = self._get_transport()
@ -12440,36 +12303,21 @@ class AIAgent:
            # External recall context is injected into the user message, not the system
            # prompt, so the stable cache prefix remains unchanged.
            #
-            # When the long-lived prefix-cache layout is active (Claude on
-            # Anthropic / OpenRouter / Nous Portal), we build the system
-            # message as a *list of content blocks*: [stable, context,
-            # volatile, ephemeral?].  Block 0 (stable) gets the 1h
-            # cache_control marker further down via
-            # apply_anthropic_cache_control_long_lived; blocks 1-3 are
-            # cached only via the rolling messages window at 5m.
            # NOTE: Plugin context from pre_llm_call hooks is injected into the
            # user message (see injection block above), NOT the system prompt.
            # This is intentional — system prompt modifications break the prompt
            # cache prefix.  The system prompt is reserved for Hermes internals.
-            if self._use_long_lived_prefix_cache:
-                _sys_parts = self._build_system_prompt_parts(system_message=system_message)
-                _sys_blocks: list = []
-                if _sys_parts.get("stable"):
-                    _sys_blocks.append({"type": "text", "text": _sys_parts["stable"]})
-                if _sys_parts.get("context"):
-                    _sys_blocks.append({"type": "text", "text": _sys_parts["context"]})
-                if _sys_parts.get("volatile"):
-                    _sys_blocks.append({"type": "text", "text": _sys_parts["volatile"]})
-                if self.ephemeral_system_prompt:
-                    _sys_blocks.append({"type": "text", "text": self.ephemeral_system_prompt})
-                if _sys_blocks:
-                    api_messages = [{"role": "system", "content": _sys_blocks}] + api_messages
-            else:
-                effective_system = active_system_prompt or ""
-                if self.ephemeral_system_prompt:
-                    effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
-                if effective_system:
-                    api_messages = [{"role": "system", "content": effective_system}] + api_messages
+            #
+            # Hermes invariant: the system prompt is built ONCE per session
+            # (cached on ``_cached_system_prompt``) and replayed verbatim on
+            # every turn.  We send it as a single content string so the
+            # bytes are byte-stable across turns and upstream prompt caches
+            # stay warm.
+            effective_system = active_system_prompt or ""
+            if self.ephemeral_system_prompt:
+                effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
+            if effective_system:
+                api_messages = [{"role": "system", "content": effective_system}] + api_messages

            # Inject ephemeral prefill messages right after the system prompt
            # but before conversation history. Same API-call-time-only pattern.
@ -12483,29 +12331,13 @@ class AIAgent:
            # gateways. Auto-detected: if ``_use_prompt_caching`` is set,
            # inject cache_control breakpoints (system + last 3 messages)
            # to reduce input token costs by ~75% on multi-turn
-            # conversations. Layout is chosen per endpoint by
-            # ``_anthropic_prompt_cache_policy``.
-            #
-            # Long-lived prefix layout (prefix_and_2): stable system block
-            # gets 1h marker + last 2 messages get 5m markers. Tools
-            # array's last entry is marked separately at API-call kwargs
-            # build time (see ``_build_api_kwargs`` and
-            # ``mark_tools_for_long_lived_cache``).
+            # conversations.
            if self._use_prompt_caching:
-                if self._use_long_lived_prefix_cache:
-                    from agent.prompt_caching import apply_anthropic_cache_control_long_lived
-                    api_messages = apply_anthropic_cache_control_long_lived(
-                        api_messages,
-                        long_lived_ttl=self._long_lived_cache_ttl,
-                        rolling_ttl=self._cache_ttl,
-                        native_anthropic=self._use_native_cache_layout,
-                    )
-                else:
-                    api_messages = apply_anthropic_cache_control(
-                        api_messages,
-                        cache_ttl=self._cache_ttl,
-                        native_anthropic=self._use_native_cache_layout,
-                    )
+                api_messages = apply_anthropic_cache_control(
+                    api_messages,
+                    cache_ttl=self._cache_ttl,
+                    native_anthropic=self._use_native_cache_layout,
+                )

            # Safety net: strip orphaned tool results / add stubs for missing
            # results before sending to the API.  Runs unconditionally — not
--- a/tests/agent/test_prompt_caching.py
+++ b/tests/agent/test_prompt_caching.py
@ -6,8 +6,6 @@ import pytest
 from agent.prompt_caching import (
    _apply_cache_marker,
    apply_anthropic_cache_control,
-    apply_anthropic_cache_control_long_lived,
-    mark_tools_for_long_lived_cache,
 )


@ -143,132 +141,3 @@ class TestApplyAnthropicCacheControl:
            elif "cache_control" in msg:
                count += 1
        assert count <= 4
-
-
-class TestMarkToolsForLongLivedCache:
-    def test_returns_unchanged_for_empty_tools(self):
-        assert mark_tools_for_long_lived_cache(None) is None
-        assert mark_tools_for_long_lived_cache([]) == []
-
-    def test_marks_only_last_tool(self):
-        tools = [
-            {"type": "function", "function": {"name": "a"}},
-            {"type": "function", "function": {"name": "b"}},
-            {"type": "function", "function": {"name": "c"}},
-        ]
-        out = mark_tools_for_long_lived_cache(tools)
-        assert "cache_control" not in out[0]
-        assert "cache_control" not in out[1]
-        assert out[2]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}
-
-    def test_does_not_mutate_input(self):
-        tools = [{"type": "function", "function": {"name": "a"}}]
-        mark_tools_for_long_lived_cache(tools)
-        assert "cache_control" not in tools[0]
-
-    def test_5m_ttl_drops_ttl_field(self):
-        tools = [{"type": "function", "function": {"name": "a"}}]
-        out = mark_tools_for_long_lived_cache(tools, long_lived_ttl="5m")
-        assert out[0]["cache_control"] == {"type": "ephemeral"}
-
-
-class TestApplyAnthropicCacheControlLongLived:
-    def test_empty_messages(self):
-        assert apply_anthropic_cache_control_long_lived([]) == []
-
-    def test_marks_first_block_of_split_system(self):
-        msgs = [
-            {"role": "system", "content": [
-                {"type": "text", "text": "STABLE"},
-                {"type": "text", "text": "CONTEXT"},
-                {"type": "text", "text": "VOLATILE"},
-            ]},
-            {"role": "user", "content": "msg1"},
-            {"role": "assistant", "content": "msg2"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-        sys_blocks = out[0]["content"]
-        assert sys_blocks[0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}
-        assert "cache_control" not in sys_blocks[1]
-        assert "cache_control" not in sys_blocks[2]
-
-    def test_rolling_marker_on_last_2_messages(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}]},
-            {"role": "user", "content": "u1"},
-            {"role": "assistant", "content": "a1"},
-            {"role": "user", "content": "u2"},
-            {"role": "assistant", "content": "a2"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-
-        def has_marker(m):
-            c = m.get("content")
-            if isinstance(c, list) and c and isinstance(c[-1], dict):
-                return "cache_control" in c[-1]
-            return "cache_control" in m
-
-        # u1 and a1 (older messages) should NOT be marked
-        assert not has_marker(out[1])
-        assert not has_marker(out[2])
-        # u2 and a2 (last 2) SHOULD be marked
-        assert has_marker(out[3])
-        assert has_marker(out[4])
-
-    def test_rolling_marker_uses_5m_ttl(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}]},
-            {"role": "user", "content": "u1"},
-            {"role": "assistant", "content": "a1"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(
-            msgs, long_lived_ttl="1h", rolling_ttl="5m",
-        )
-        # Last user message: cache_control on the wrapped text part should be 5m
-        last = out[-1]
-        c = last["content"]
-        assert isinstance(c, list)
-        assert c[-1]["cache_control"] == {"type": "ephemeral"}  # 5m has no ttl key
-
-    def test_string_system_falls_back_to_envelope_marker(self):
-        """When the caller didn't split the system message, we still place a marker."""
-        msgs = [
-            {"role": "system", "content": "Single string system"},
-            {"role": "user", "content": "u1"},
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-        sys_content = out[0]["content"]
-        # Wrapped into a list and the (now sole) block gets the 1h marker
-        assert isinstance(sys_content, list)
-        assert sys_content[0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}
-
-    def test_does_not_mutate_input(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}]},
-            {"role": "user", "content": "u1"},
-        ]
-        before = copy.deepcopy(msgs)
-        apply_anthropic_cache_control_long_lived(msgs)
-        assert msgs == before
-
-    def test_max_4_breakpoints_with_split_system(self):
-        msgs = [
-            {"role": "system", "content": [{"type": "text", "text": "S"}, {"type": "text", "text": "V"}]},
-        ] + [
-            {"role": "user" if i % 2 == 0 else "assistant", "content": f"msg{i}"}
-            for i in range(10)
-        ]
-        out = apply_anthropic_cache_control_long_lived(msgs)
-        count = 0
-        for m in out:
-            c = m.get("content")
-            if isinstance(c, list):
-                for item in c:
-                    if isinstance(item, dict) and "cache_control" in item:
-                        count += 1
-            elif "cache_control" in m:
-                count += 1
-        # 1 system block + last 2 messages = 3 breakpoints from this function.
-        # tools[-1] is marked separately (not via this function), so a 4th
-        # breakpoint can be added at API-call time.
-        assert count == 3
--- a/tests/agent/test_prompt_caching_live.py
+++ b/tests/agent/test_prompt_caching_live.py
@ -1,112 +0,0 @@
-"""Live E2E: long-lived prefix caching on Claude via OpenRouter.
-
-Run only when LIVE_OR_KEY env var is set. Skipped under the normal hermetic
-test suite (which unsets credentials).
-"""
-import os, sys, tempfile, time, shutil, pytest
-
-
-# Probe for the key BEFORE conftest unsets it
-_LIVE_KEY = os.environ.get("OPENROUTER_API_KEY") or os.environ.get("LIVE_OR_KEY")
-if not _LIVE_KEY:
-    # Try to read directly from .env
-    env_path = os.path.expanduser("~/.hermes/.env")
-    if os.path.exists(env_path):
-        with open(env_path) as f:
-            for line in f:
-                if line.startswith("OPENROUTER_API_KEY="):
-                    _LIVE_KEY = line.strip().split("=", 1)[1].strip().strip('"').strip("'")
-                    break
-
-
-pytestmark = pytest.mark.skipif(
-    not _LIVE_KEY,
-    reason="set OPENROUTER_API_KEY (or LIVE_OR_KEY) to run live cache test",
-)
-
-
-def test_long_lived_prefix_cache_e2e_openrouter(tmp_path, monkeypatch):
-    """Two AIAgent runs in fresh sessions: call 1 writes cache, call 2 reads it."""
-    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
-    # The hermetic conftest unsets OPENROUTER_API_KEY — restore for this test
-    monkeypatch.setenv("OPENROUTER_API_KEY", _LIVE_KEY)
-
-    # Minimal config — but with enough toolset/guidance to exceed Anthropic's
-    # ~1024-token minimum-cacheable-prefix threshold. Anthropic silently
-    # ignores cache_control markers on small blocks.
-    import yaml
-    cfg_path = tmp_path / "config.yaml"
-    cfg_path.write_text(yaml.safe_dump({
-        "model": {"provider": "openrouter", "default": "anthropic/claude-haiku-4.5"},
-        "prompt_caching": {"long_lived_prefix": True, "long_lived_ttl": "1h", "cache_ttl": "5m"},
-        "agent": {"tool_use_enforcement": True},   # adds substantial guidance text
-        "memory": {"provider": ""},
-        "compression": {"enabled": False},
-    }))
-
-    from run_agent import AIAgent
-
-    def make_agent():
-        return AIAgent(
-            api_key=_LIVE_KEY,
-            base_url="https://openrouter.ai/api/v1",
-            provider="openrouter",
-            model="anthropic/claude-haiku-4.5",
-            api_mode="chat_completions",
-            # Use the default toolset roster — the tools array (~13k tokens
-            # for ~35 tools) is what carries the bulk of the cross-session
-            # cache value. With a tiny toolset the cached prefix can fall
-            # below Anthropic Haiku's 2048-token minimum cacheable size and
-            # the marker is silently ignored.
-            enabled_toolsets=None,
-            quiet_mode=True,
-            skip_context_files=True,
-            skip_memory=True,
-            save_trajectories=False,
-        )
-
-    a1 = make_agent()
-    assert a1._use_prompt_caching is True, "policy should enable caching for Claude on OR"
-    assert a1._use_long_lived_prefix_cache is True, "long-lived path should activate"
-    parts = a1._build_system_prompt_parts()
-    print(f"\nstable={len(parts['stable']):,} ctx={len(parts['context']):,} volatile={len(parts['volatile']):,} chars")
-    print(f"tool count: {len(a1.tools or [])}")
-
-    # Use distinct user messages each call so OpenRouter's response cache
-    # doesn't short-circuit the upstream Anthropic call (we need real
-    # Anthropic billing visibility to verify cache_creation/cache_read).
-    USER_1 = "Reply with the single word ALPHA."
-    USER_2 = "Reply with the single word BRAVO."
-
-    print("\n--- Call 1 (cold) ---")
-    r1 = a1.run_conversation(USER_1, conversation_history=[])
-    print(f"final_response[:80]: {(r1.get('final_response') or '')[:80]!r}")
-    cr1 = a1.session_cache_read_tokens
-    cw1 = a1.session_cache_write_tokens
-    print(f"call1: cache_read={cr1} cache_write={cw1}")
-
-    # Wait so cache settles, then fresh agent (NEW SESSION) for cross-session read
-    time.sleep(2)
-    a2 = make_agent()
-    assert a2.session_id != a1.session_id, "second agent must have a new session"
-
-    print("\n--- Call 2 (warm, NEW session, different user msg) ---")
-    r2 = a2.run_conversation(USER_2, conversation_history=[])
-    print(f"final_response[:80]: {(r2.get('final_response') or '')[:80]!r}")
-    cr2 = a2.session_cache_read_tokens
-    cw2 = a2.session_cache_write_tokens
-    print(f"call2: cache_read={cr2} cache_write={cw2}")
-
-    print(f"\n=== VERDICT ===")
-    print(f"  call1 wrote {cw1:,} cache tokens, read {cr1:,}")
-    print(f"  call2 wrote {cw2:,} cache tokens, read {cr2:,}")
-    if cw1:
-        print(f"  cross-session read fraction: cr2/cw1 = {cr2/cw1:.2%}")
-
-    # Assertions
-    assert cw1 > 0, f"call 1 must write cache (got {cw1}); long-lived layout not reaching wire"
-    assert cr2 > 0, (
-        f"call 2 must read cache cross-session (got {cr2}); "
-        f"stable prefix is not byte-stable across sessions"
-    )
-    assert cr2 >= 1000, f"cache_read on call 2 ({cr2}) too small to indicate real reuse"
--- a/tests/run_agent/test_anthropic_prompt_cache_policy.py
+++ b/tests/run_agent/test_anthropic_prompt_cache_policy.py
@ -330,134 +330,3 @@ class TestExplicitOverrides:
 # Long-lived prefix cache policy (cross-session 1h tier)
 # ─────────────────────────────────────────────────────────────────────

-class TestSupportsLongLivedAnthropicCache:
-    """Narrower than _anthropic_prompt_cache_policy — only Claude on the 4
-    explicitly-validated endpoints get the long-lived layout."""
-
-    def test_native_anthropic_claude_supported(self):
-        agent = _make_agent(
-            provider="anthropic",
-            base_url="https://api.anthropic.com",
-            api_mode="anthropic_messages",
-            model="claude-sonnet-4.6",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_anthropic_oauth_supported(self):
-        # OAuth uses the same transport as native Anthropic
-        agent = _make_agent(
-            provider="anthropic",
-            base_url="https://api.anthropic.com",
-            api_mode="anthropic_messages",
-            model="claude-opus-4.6",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_openrouter_claude_supported(self):
-        agent = _make_agent(
-            provider="openrouter",
-            base_url="https://openrouter.ai/api/v1",
-            api_mode="chat_completions",
-            model="anthropic/claude-sonnet-4.6",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_nous_portal_claude_supported(self):
-        # Nous Portal proxies to OpenRouter — same wire format
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="anthropic/claude-opus-4.7",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is True
-
-    def test_nous_portal_qwen_NOT_long_lived(self):
-        # Portal Qwen still gets cache_control markers via the standard
-        # system_and_3 5m layout (see _anthropic_prompt_cache_policy
-        # tests above), but it must NOT ride the prefix_and_2 1h layout.
-        # Alibaba DashScope (the upstream for every Qwen route, incl.
-        # Portal -> OpenRouter -> Alibaba) only supports a single
-        # ``ephemeral`` TTL of 5 minutes; ttl="1h" markers are silently
-        # ignored, so the high-value tools[-1] + system-prefix
-        # breakpoints don't land. Stay on system_and_3 instead.
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="qwen3.6-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_nous_portal_qwen_vendored_slug_NOT_long_lived(self):
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="qwen/qwen3.6-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_nous_portal_non_claude_rejected(self):
-        # Portal long-lived cache scope is now Claude-only. Qwen
-        # rejection is covered by the dedicated tests above; this
-        # covers everything else (gpt, etc.).
-        agent = _make_agent(
-            provider="nous",
-            base_url="https://inference-api.nousresearch.com/v1",
-            api_mode="chat_completions",
-            model="openai/gpt-5.4",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_openrouter_non_claude_rejected(self):
-        agent = _make_agent(
-            provider="openrouter",
-            base_url="https://openrouter.ai/api/v1",
-            api_mode="chat_completions",
-            model="openai/gpt-5.4",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_third_party_anthropic_gateway_rejected(self):
-        # MiniMax / Kimi / etc. — anthropic-wire but not in our validated list
-        agent = _make_agent(
-            provider="minimax",
-            base_url="https://api.minimax.io/anthropic",
-            api_mode="anthropic_messages",
-            model="minimax-m2.7",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_alibaba_dashscope_rejected(self):
-        agent = _make_agent(
-            provider="alibaba",
-            base_url="https://dashscope.aliyuncs.com/api/v1/anthropic",
-            api_mode="anthropic_messages",
-            model="qwen3.5-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_opencode_qwen_rejected(self):
-        agent = _make_agent(
-            provider="opencode-go",
-            base_url="https://api.opencode-go.example/v1",
-            api_mode="chat_completions",
-            model="qwen3.6-plus",
-        )
-        assert agent._supports_long_lived_anthropic_cache() is False
-
-    def test_fallback_target_evaluated_independently(self):
-        # Starting on a non-supported provider, falling back to OpenRouter Claude
-        agent = _make_agent(
-            provider="minimax",
-            base_url="https://api.minimax.io/anthropic",
-            api_mode="anthropic_messages",
-            model="minimax-m2.7",
-        )
-        assert agent._supports_long_lived_anthropic_cache(
-            provider="openrouter",
-            base_url="https://openrouter.ai/api/v1",
-            api_mode="chat_completions",
-            model="anthropic/claude-sonnet-4.6",
-        ) is True
--- a/tests/test_ctx_halving_fix.py
+++ b/tests/test_ctx_halving_fix.py
@ -169,7 +169,6 @@ class TestEphemeralMaxOutputTokens:
        agent.reasoning_config = None
        agent._is_anthropic_oauth = False
        agent._ephemeral_max_output_tokens = None
-        agent._use_long_lived_prefix_cache = False

        compressor = MagicMock()
        compressor.context_length = 200_000