mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(cache): drop ttl=1h on Portal Qwen — Alibaba upstream is 5m-only (#24702)
PR #24151 routed Portal Qwen (qwen3.6-plus) through the prefix_and_2 long-lived cache layout, attaching {"type":"ephemeral","ttl":"1h"} markers to the tools[-1] entry and the stable system-prefix block. That layout works for Portal Claude because Anthropic, and OpenRouter on its Anthropic routes, honour the 1h TTL — but Portal Qwen ultimately proxies to Alibaba DashScope, which documents a single "ephemeral" TTL of 5 minutes on its Context Cache. The ttl="1h" qualifier is silently dropped upstream, so the two highest-value breakpoints (tools array + system prefix) never land. Only the rolling-window 5m markers on the last 2 messages are actually cached, which matches the observed ~25% read rate.

Fix: keep Portal Qwen on cache_control via _anthropic_prompt_cache_policy returning (True, False), but drop it from _supports_long_lived_anthropic_cache so it rides the standard system_and_3 5m layout (system + last 3 messages, all at 5m). Same 4 breakpoints, all in a TTL the upstream actually honours.

Refs:
- https://www.alibabacloud.com/help/en/model-studio/context-cache
- https://openrouter.ai/docs/features/prompt-caching (Alibaba Qwen section: "TTL: 5 minutes")

Changes:
- _supports_long_lived_anthropic_cache: Portal scope narrowed back to Claude
- tests: flip the two qwen long-lived expectations to False, retitle non_claude_non_qwen_rejected -> non_claude_rejected
This commit is contained in:
parent
d8c4460fe3
commit
2a18b6283b
2 changed files with 29 additions and 15 deletions
19
run_agent.py
19
run_agent.py
|
|
@@ -3619,12 +3619,19 @@ class AIAgent:
|
|||
is_claude = "claude" in model_lower
|
||||
is_nous_portal = "nousresearch" in eff_base_url.lower()
|
||||
|
||||
# Nous Portal: Claude AND Qwen both get long-lived caching.
|
||||
# Portal proxies to OpenRouter with identical cache_control
|
||||
# semantics; any model on Portal that accepts envelope-layout
|
||||
# markers via _anthropic_prompt_cache_policy also benefits from
|
||||
# the documented 1h cross-session TTL.
|
||||
if is_nous_portal and (is_claude or "qwen" in model_lower):
|
||||
# Nous Portal Claude rides the 1h prefix_and_2 layout (Portal
|
||||
# proxies to OpenRouter, which honours ttl=1h on Anthropic
|
||||
# routes). Qwen does NOT — Alibaba DashScope (the upstream for
|
||||
# all Qwen routes, including Portal -> OpenRouter -> Alibaba)
|
||||
# documents a single ``ephemeral`` TTL of 5 minutes; ttl="1h"
|
||||
# on Qwen markers is silently ignored upstream, so the
|
||||
# high-value tools[-1] + system-prefix breakpoints never land
|
||||
# and only the 5m rolling-window markers on the last 2 messages
|
||||
# get cached. Portal Qwen still gets cache_control via
|
||||
# _anthropic_prompt_cache_policy returning (True, False) — it
|
||||
# just rides the standard system_and_3 5m layout instead of the
|
||||
# mismatched prefix_and_2 1h layout.
|
||||
if is_nous_portal and is_claude:
|
||||
return True
|
||||
|
||||
if not is_claude:
|
||||
|
|
|
|||
|
|
@@ -372,29 +372,36 @@ class TestSupportsLongLivedAnthropicCache:
|
|||
)
|
||||
assert agent._supports_long_lived_anthropic_cache() is True
|
||||
|
||||
def test_nous_portal_qwen_supported(self):
|
||||
# Portal Qwen rides the same OpenRouter-equivalent transport as
|
||||
# Portal Claude; long-lived (1h cross-session) cache_control
|
||||
# markers apply identically.
|
||||
def test_nous_portal_qwen_NOT_long_lived(self):
|
||||
# Portal Qwen still gets cache_control markers via the standard
|
||||
# system_and_3 5m layout (see _anthropic_prompt_cache_policy
|
||||
# tests above), but it must NOT ride the prefix_and_2 1h layout.
|
||||
# Alibaba DashScope (the upstream for every Qwen route, incl.
|
||||
# Portal -> OpenRouter -> Alibaba) only supports a single
|
||||
# ``ephemeral`` TTL of 5 minutes; ttl="1h" markers are silently
|
||||
# ignored, so the high-value tools[-1] + system-prefix
|
||||
# breakpoints don't land. Stay on system_and_3 instead.
|
||||
agent = _make_agent(
|
||||
provider="nous",
|
||||
base_url="https://inference-api.nousresearch.com/v1",
|
||||
api_mode="chat_completions",
|
||||
model="qwen3.6-plus",
|
||||
)
|
||||
assert agent._supports_long_lived_anthropic_cache() is True
|
||||
assert agent._supports_long_lived_anthropic_cache() is False
|
||||
|
||||
def test_nous_portal_qwen_vendored_slug_supported(self):
|
||||
def test_nous_portal_qwen_vendored_slug_NOT_long_lived(self):
|
||||
agent = _make_agent(
|
||||
provider="nous",
|
||||
base_url="https://inference-api.nousresearch.com/v1",
|
||||
api_mode="chat_completions",
|
||||
model="qwen/qwen3.6-plus",
|
||||
)
|
||||
assert agent._supports_long_lived_anthropic_cache() is True
|
||||
assert agent._supports_long_lived_anthropic_cache() is False
|
||||
|
||||
def test_nous_portal_non_claude_non_qwen_rejected(self):
|
||||
# Portal long-lived cache scope mirrors policy: Claude or Qwen only.
|
||||
def test_nous_portal_non_claude_rejected(self):
|
||||
# Portal long-lived cache scope is now Claude-only. Qwen
|
||||
# rejection is covered by the dedicated tests above; this
|
||||
# covers everything else (gpt, etc.).
|
||||
agent = _make_agent(
|
||||
provider="nous",
|
||||
base_url="https://inference-api.nousresearch.com/v1",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue