mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
Cuts input cost for first-turn Claude requests by ~85-90% on subsequent
sessions within an hour. Tools array (~13k tokens for default toolset) +
stable system prefix (~5-8k tokens) get a 1h cache_control marker; the
volatile suffix (memory, USER profile, timestamp, session id) sits in a
separate non-cached block at the end so it doesn't poison the cross-session
prefix when it changes.
Provider gate: Claude on native Anthropic (incl. OAuth subscription),
OpenRouter, and Nous Portal (which proxies to OpenRouter). All other
providers keep today's system_and_3 layout unchanged.
Layout (4 cache_control breakpoints, Anthropic max):
1. tools[-1] -> 1h (cross-session)
2. system content[0] -> 1h (cross-session, stable prefix)
3. messages[-2] -> 5m (within-session rolling)
4. messages[-1] -> 5m (within-session rolling)
Within-session rolling shrinks from 3 messages to 2 to free the breakpoint
budget. On Claude with realistic tool loadouts the long-lived tier carries
the bulk of cross-session value anyway.
System prompt is now always assembled cache-friendly: stable identity /
guidance / skills / platform hints first, then session-stable context
files (AGENTS.md, .cursorrules), then per-call volatile content. Old
single-string callers see the same logical content (same join order),
just reordered so volatile lives at the end.
Config knobs (defaults shown):
prompt_caching:
cache_ttl: "5m" # rolling-window TTL (unchanged)
long_lived_prefix: true # opt-out switch
long_lived_ttl: "1h" # cross-session prefix TTL
Live E2E (tests/agent/test_prompt_caching_live.py, gated on
OPENROUTER_API_KEY) on anthropic/claude-haiku-4.5 with default toolset:
Call 1 (cold): cache_write=13,415 cache_read=0
Call 2 (NEW agent + msg): cache_write=391 cache_read=13,025
Cross-session reuse: 97.09%
Implementation:
* agent/prompt_caching.py: new apply_anthropic_cache_control_long_lived()
+ mark_tools_for_long_lived_cache(); existing apply_anthropic_cache_control()
preserved verbatim for the fallback path.
* agent/anthropic_adapter.py: convert_tools_to_anthropic() now forwards
cache_control onto each Anthropic-format tool dict.
* run_agent.py: _build_system_prompt_parts() returns the 3-tier dict;
_build_system_prompt() joins them (backward compatible).
_supports_long_lived_anthropic_cache() policy added next to the existing
_anthropic_prompt_cache_policy() (which now also recognises Nous Portal
Claude — pre-existing gap fixed in passing).
_build_api_kwargs() resolves tools_for_api once and propagates the
marker through all four build paths (anthropic_messages, bedrock,
codex_responses, profile/legacy chat completions).
Long-lived flag plumbed into the runtime snapshot/restore + model-switch
+ fallback-promotion paths.
Tests:
* tests/agent/test_prompt_caching.py: +8 tests (TestMarkToolsForLongLivedCache,
TestApplyAnthropicCacheControlLongLived).
* tests/run_agent/test_anthropic_prompt_cache_policy.py: +9 tests
(TestSupportsLongLivedAnthropicCache matrix across 8 endpoint classes
+ a fallback-target case).
* tests/agent/test_prompt_caching_live.py: new live E2E (skipif when
OPENROUTER_API_KEY is unset; runs outside the hermetic suite).
* Targeted suites: 327/327 pass (caching/adapter/policy/builder).
* tests/agent/ + tests/run_agent/: 3992 pass, 17 skip, 1 pre-existing
flake (test_async_httpx_del_neuter::test_same_key_replaces_stale_loop_entry,
verified failing on pristine origin/main).
112 lines
4.8 KiB
Python
112 lines
4.8 KiB
Python
"""Live E2E: long-lived prefix caching on Claude via OpenRouter.
|
|
|
|
Run only when LIVE_OR_KEY env var is set. Skipped under the normal hermetic
|
|
test suite (which unsets credentials).
|
|
"""
|
|
import os, sys, tempfile, time, shutil, pytest
|
|
|
|
|
|
# Probe for the key BEFORE conftest unsets it
|
|
_LIVE_KEY = os.environ.get("OPENROUTER_API_KEY") or os.environ.get("LIVE_OR_KEY")
|
|
if not _LIVE_KEY:
|
|
# Try to read directly from .env
|
|
env_path = os.path.expanduser("~/.hermes/.env")
|
|
if os.path.exists(env_path):
|
|
with open(env_path) as f:
|
|
for line in f:
|
|
if line.startswith("OPENROUTER_API_KEY="):
|
|
_LIVE_KEY = line.strip().split("=", 1)[1].strip().strip('"').strip("'")
|
|
break
|
|
|
|
|
|
pytestmark = pytest.mark.skipif(
|
|
not _LIVE_KEY,
|
|
reason="set OPENROUTER_API_KEY (or LIVE_OR_KEY) to run live cache test",
|
|
)
|
|
|
|
|
|
def test_long_lived_prefix_cache_e2e_openrouter(tmp_path, monkeypatch):
|
|
"""Two AIAgent runs in fresh sessions: call 1 writes cache, call 2 reads it."""
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
# The hermetic conftest unsets OPENROUTER_API_KEY — restore for this test
|
|
monkeypatch.setenv("OPENROUTER_API_KEY", _LIVE_KEY)
|
|
|
|
# Minimal config — but with enough toolset/guidance to exceed Anthropic's
|
|
# ~1024-token minimum-cacheable-prefix threshold. Anthropic silently
|
|
# ignores cache_control markers on small blocks.
|
|
import yaml
|
|
cfg_path = tmp_path / "config.yaml"
|
|
cfg_path.write_text(yaml.safe_dump({
|
|
"model": {"provider": "openrouter", "default": "anthropic/claude-haiku-4.5"},
|
|
"prompt_caching": {"long_lived_prefix": True, "long_lived_ttl": "1h", "cache_ttl": "5m"},
|
|
"agent": {"tool_use_enforcement": True}, # adds substantial guidance text
|
|
"memory": {"provider": ""},
|
|
"compression": {"enabled": False},
|
|
}))
|
|
|
|
from run_agent import AIAgent
|
|
|
|
def make_agent():
|
|
return AIAgent(
|
|
api_key=_LIVE_KEY,
|
|
base_url="https://openrouter.ai/api/v1",
|
|
provider="openrouter",
|
|
model="anthropic/claude-haiku-4.5",
|
|
api_mode="chat_completions",
|
|
# Use the default toolset roster — the tools array (~13k tokens
|
|
# for ~35 tools) is what carries the bulk of the cross-session
|
|
# cache value. With a tiny toolset the cached prefix can fall
|
|
# below Anthropic Haiku's 2048-token minimum cacheable size and
|
|
# the marker is silently ignored.
|
|
enabled_toolsets=None,
|
|
quiet_mode=True,
|
|
skip_context_files=True,
|
|
skip_memory=True,
|
|
save_trajectories=False,
|
|
)
|
|
|
|
a1 = make_agent()
|
|
assert a1._use_prompt_caching is True, "policy should enable caching for Claude on OR"
|
|
assert a1._use_long_lived_prefix_cache is True, "long-lived path should activate"
|
|
parts = a1._build_system_prompt_parts()
|
|
print(f"\nstable={len(parts['stable']):,} ctx={len(parts['context']):,} volatile={len(parts['volatile']):,} chars")
|
|
print(f"tool count: {len(a1.tools or [])}")
|
|
|
|
# Use distinct user messages each call so OpenRouter's response cache
|
|
# doesn't short-circuit the upstream Anthropic call (we need real
|
|
# Anthropic billing visibility to verify cache_creation/cache_read).
|
|
USER_1 = "Reply with the single word ALPHA."
|
|
USER_2 = "Reply with the single word BRAVO."
|
|
|
|
print("\n--- Call 1 (cold) ---")
|
|
r1 = a1.run_conversation(USER_1, conversation_history=[])
|
|
print(f"final_response[:80]: {(r1.get('final_response') or '')[:80]!r}")
|
|
cr1 = a1.session_cache_read_tokens
|
|
cw1 = a1.session_cache_write_tokens
|
|
print(f"call1: cache_read={cr1} cache_write={cw1}")
|
|
|
|
# Wait so cache settles, then fresh agent (NEW SESSION) for cross-session read
|
|
time.sleep(2)
|
|
a2 = make_agent()
|
|
assert a2.session_id != a1.session_id, "second agent must have a new session"
|
|
|
|
print("\n--- Call 2 (warm, NEW session, different user msg) ---")
|
|
r2 = a2.run_conversation(USER_2, conversation_history=[])
|
|
print(f"final_response[:80]: {(r2.get('final_response') or '')[:80]!r}")
|
|
cr2 = a2.session_cache_read_tokens
|
|
cw2 = a2.session_cache_write_tokens
|
|
print(f"call2: cache_read={cr2} cache_write={cw2}")
|
|
|
|
print(f"\n=== VERDICT ===")
|
|
print(f" call1 wrote {cw1:,} cache tokens, read {cr1:,}")
|
|
print(f" call2 wrote {cw2:,} cache tokens, read {cr2:,}")
|
|
if cw1:
|
|
print(f" cross-session read fraction: cr2/cw1 = {cr2/cw1:.2%}")
|
|
|
|
# Assertions
|
|
assert cw1 > 0, f"call 1 must write cache (got {cw1}); long-lived layout not reaching wire"
|
|
assert cr2 > 0, (
|
|
f"call 2 must read cache cross-session (got {cr2}); "
|
|
f"stable prefix is not byte-stable across sessions"
|
|
)
|
|
assert cr2 >= 1000, f"cache_read on call 2 ({cr2}) too small to indicate real reuse"
|