diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 57e79f36e07..1f406564def 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -101,6 +101,7 @@ class _OpenAIProxy: OpenAI = _OpenAIProxy() # module-level name, resolves lazily on call/isinstance from agent.credential_pool import load_pool +from agent.model_metadata import MINIMUM_CONTEXT_LENGTH, get_model_context_length from hermes_cli.config import get_hermes_home from hermes_constants import OPENROUTER_BASE_URL from utils import base_url_host_matches, base_url_hostname, env_float, model_forces_max_completion_tokens, normalize_proxy_env_vars @@ -3149,6 +3150,88 @@ def _try_main_agent_model_fallback( return client, resolved_model or main_model, label +# ── Context-window screening for runtime fallback chains (issue #52392) ── +# +# When the runtime auxiliary fallback chain selects a candidate that is +# reachable but has a context window smaller than the compression task +# requires, the call errors out instead of continuing to the next, viable +# candidate. The startup feasibility check in +# ``agent.conversation_compression.check_compression_model_feasibility`` +# already filters too-small auxiliary models at startup, but the runtime +# fallback chain (``_try_configured_fallback_chain`` and +# ``_try_main_fallback_chain``) does not apply the same filter, so +# compression can stop at the first alive door even if the room behind it +# is too small. +# +# The helpers below screen each candidate by its effective context window +# before it is returned. ``None`` results from ``get_model_context_length`` +# are passed through (we cannot prove a model is too small, so we do not +# block it). This preserves the existing fallback surface for +# unrecognised/custom models while closing the gap on the well-known ones. + +def _task_minimum_context_length(task: Optional[str]) -> Optional[int]: + """Return the minimum context length required for an auxiliary task. + + Only ``compression`` carries an explicit minimum today (the same + ``MINIMUM_CONTEXT_LENGTH`` (64K) floor that + ``check_compression_model_feasibility`` already enforces at startup). + Other tasks (``vision``, ``title_generation``, ``web_extract``, + ``skills_hub``, ``mcp``, ``session_search``) return ``None`` — they + have no per-task context floor and the runtime chain must remain + permissive for them. + + Returns ``None`` for an empty/``None`` task name so the helper is a + safe no-op when called from generic sites. + """ + if not task: + return None + if task == "compression": + return MINIMUM_CONTEXT_LENGTH + return None + + +def _candidate_context_window( + provider: str, + model: str, + base_url: str = "", + api_key: str = "", +) -> Optional[int]: + """Resolve the effective context window for a fallback candidate. + + Thin wrapper around :func:`agent.model_metadata.get_model_context_length` + that swallows probe failures (returns ``None``). Callers treat + ``None`` as "unknown — pass through" so the existing fallback + surface is preserved when the context-length resolver chain cannot + determine a value (custom endpoints, models not in the registry, + offline endpoints). + + Best-effort, never raises — the runtime fallback chain must keep + moving even if the resolver hits a probe error. + """ + if not model: + return None + try: + ctx = get_model_context_length( + model, + base_url=base_url, + api_key=api_key, + provider=provider, + ) + except Exception as exc: + logger.debug( + "Auxiliary fallback: could not resolve context window for %s/%s: %s", + provider, model, exc, + ) + return None + # ``get_model_context_length`` returns an int (with a 256K default + # fallback when nothing else matches). We still propagate ``None`` if + # a future change returns ``Optional[int]`` — being explicit is + # cheap and the test suite covers both shapes. + if isinstance(ctx, int) and ctx > 0: + return ctx + return None + + def _try_configured_fallback_chain( task: str, failed_provider: str, @@ -3173,6 +3256,7 @@ def _try_configured_fallback_chain( skip = failed_provider.lower().strip() tried = [] + min_ctx = _task_minimum_context_length(task) for i, entry in enumerate(chain): if not isinstance(entry, dict): @@ -3190,6 +3274,20 @@ def _try_configured_fallback_chain( fb_client, resolved_model = None, None if fb_client is not None: + if min_ctx is not None and resolved_model: + fb_ctx = _candidate_context_window( + fb_provider, + resolved_model, + base_url=str(entry.get("base_url") or ""), + api_key=_fallback_entry_api_key(entry) or "", + ) + if fb_ctx is not None and fb_ctx < min_ctx: + logger.info( + "Auxiliary %s: skipping %s (%s context=%d < min=%d), continuing chain", + task, label, resolved_model, fb_ctx, min_ctx, + ) + tried.append(f"{label} (context too small: {fb_ctx}<{min_ctx})") + continue logger.info( "Auxiliary %s: %s on %s — configured fallback to %s (%s)", task, reason, failed_provider, label, resolved_model or fb_model or "default", @@ -3285,6 +3383,7 @@ def _try_main_fallback_chain( main_norm = (_read_main_provider() or "").strip().lower() skip = {p for p in (failed_norm, main_norm, "auto") if p} tried: List[str] = [] + min_ctx = _task_minimum_context_length(task) for i, entry in enumerate(chain): if not isinstance(entry, dict): @@ -3308,6 +3407,20 @@ def _try_main_fallback_chain( logger.debug("Auxiliary %s: main fallback %s failed to resolve: %s", task or "call", label, exc) fb_client, resolved_model = None, None if fb_client is not None: + if min_ctx is not None: + fb_ctx = _candidate_context_window( + fb_provider, + resolved_model or fb_model, + base_url=str(entry.get("base_url") or ""), + api_key=_fallback_entry_api_key(entry) or "", + ) + if fb_ctx is not None and fb_ctx < min_ctx: + logger.info( + "Auxiliary %s: skipping %s (context=%d < min=%d), continuing chain", + task or "call", label, fb_ctx, min_ctx, + ) + tried.append(f"{label} (context too small: {fb_ctx}<{min_ctx})") + continue logger.info( "Auxiliary %s: %s on %s — main fallback chain to %s (%s)", task or "call", reason, failed_provider or "auto", label, diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index d790920b1ee..0c5c4397381 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -4198,3 +4198,257 @@ class TestAuxiliaryMaxTokensParam: ): assert auxiliary_max_tokens_param(4096, model="") == {"max_tokens": 4096} assert auxiliary_max_tokens_param(4096, model=None) == {"max_tokens": 4096} + + +# ── Regression tests for issue #52392 ───────────────────────────────────── +# Compression fallback chain currently picks the first reachable candidate +# without checking whether the candidate's context window is large enough. +# When the chosen candidate is reachable but too small for the compression +# task, the call errors out instead of continuing through the chain. + +class TestCompressionFallbackContextFilter: + """Aux fallback chains must skip candidates whose context window is + smaller than the task minimum, then continue to the next candidate. + + Layer coverage: + L2: _try_configured_fallback_chain skips too-small candidates + L3: _try_main_fallback_chain skips too-small candidates + L4: candidates with unknown context (None) are passed through + L5: backward compat — first viable candidate still wins + """ + + @staticmethod + def _make_chain_entry(provider, model, base_url="https://example.com/v1", + api_key="k"): + return { + "provider": provider, + "model": model, + "base_url": base_url, + "api_key": api_key, + } + + def _mock_resolve(self, entry): + """Mock _resolve_fallback_entry to return a (client, model) per entry.""" + client = MagicMock() + client.base_url = entry.get("base_url", "") + return client, entry["model"] + + # ── L2: configured fallback chain ───────────────────────────────── + + def test_configured_chain_skips_too_small_candidate_for_compression(self, monkeypatch): + """When entry[0] is reachable but too small and entry[1] is large enough, + _try_configured_fallback_chain must return entry[1], not entry[0].""" + from agent.auxiliary_client import ( + _try_configured_fallback_chain, + ) + + small_client = MagicMock(name="small_client") + large_client = MagicMock(name="large_client") + entries = [ + self._make_chain_entry("small-provider", "tiny-8k"), + self._make_chain_entry("big-provider", "huge-1m"), + ] + + def fake_resolve(entry): + if entry is entries[0]: + return small_client, "tiny-8k" + return large_client, "huge-1m" + + # tiny-8k resolves to 8K (below 64K floor); huge-1m resolves to 1M + def fake_ctx(model, base_url="", api_key="", **kwargs): + return {"tiny-8k": 8192, "huge-1m": 1_048_576}.get(model, 256_000) + + monkeypatch.setattr( + "agent.auxiliary_client._get_auxiliary_task_config", + lambda task: {"fallback_chain": entries} if task == "compression" else {}, + ) + + with patch("agent.auxiliary_client._resolve_fallback_entry", + side_effect=fake_resolve), \ + patch("agent.auxiliary_client.get_model_context_length", + side_effect=fake_ctx): + client, model, label = _try_configured_fallback_chain( + task="compression", failed_provider="auto") + + assert client is large_client, ( + f"Expected large_client (1M context), got {client}. " + "L2 bug: chain returned the first reachable candidate without " + "screening by context window.") + assert model == "huge-1m" + assert "big-provider" in label + + def test_configured_chain_continues_after_skipping_too_small(self, monkeypatch): + """When all small candidates are skipped and only the last is large enough, + the chain still returns it (does not stop after first filter).""" + from agent.auxiliary_client import _try_configured_fallback_chain + + small_client_a = MagicMock(name="small_a") + small_client_b = MagicMock(name="small_b") + large_client = MagicMock(name="large") + entries = [ + self._make_chain_entry("p1", "small-a-32k"), + self._make_chain_entry("p2", "small-b-48k"), + self._make_chain_entry("p3", "large-512k"), + ] + + def fake_resolve(entry): + if entry is entries[0]: + return small_client_a, "small-a-32k" + if entry is entries[1]: + return small_client_b, "small-b-48k" + return large_client, "large-512k" + + def fake_ctx(model, base_url="", api_key="", **kwargs): + return {"small-a-32k": 32_000, + "small-b-48k": 48_000, + "large-512k": 512_000}.get(model, 256_000) + + monkeypatch.setattr( + "agent.auxiliary_client._get_auxiliary_task_config", + lambda task: {"fallback_chain": entries} if task == "compression" else {}, + ) + + with patch("agent.auxiliary_client._resolve_fallback_entry", + side_effect=fake_resolve), \ + patch("agent.auxiliary_client.get_model_context_length", + side_effect=fake_ctx): + client, model, label = _try_configured_fallback_chain( + task="compression", failed_provider="auto") + + assert client is large_client + assert model == "large-512k" + + # ── L3: main fallback chain ──────────────────────────────────────── + + def test_main_chain_skips_too_small_candidate_for_compression(self, monkeypatch): + """Same behaviour for the top-level main-agent fallback chain.""" + from agent.auxiliary_client import ( + _try_main_fallback_chain, + ) + + small_client = MagicMock(name="small_main") + large_client = MagicMock(name="large_main") + + # Mock load_config + get_fallback_chain to return our controlled chain + chain = [ + self._make_chain_entry("p-small", "tiny-16k"), + self._make_chain_entry("p-large", "huge-1m"), + ] + + def fake_resolve(entry): + if entry is chain[0]: + return small_client, "tiny-16k" + return large_client, "huge-1m" + + def fake_ctx(model, base_url="", api_key="", **kwargs): + return {"tiny-16k": 16_384, "huge-1m": 1_048_576}.get(model, 256_000) + + monkeypatch.setattr( + "hermes_cli.fallback_config.get_fallback_chain", + lambda cfg: chain, + ) + + with patch("agent.auxiliary_client._resolve_fallback_entry", + side_effect=fake_resolve), \ + patch("agent.auxiliary_client.get_model_context_length", + side_effect=fake_ctx), \ + patch("agent.auxiliary_client._is_provider_unhealthy", + return_value=False): + client, model, label = _try_main_fallback_chain( + task="compression", failed_provider="auto") + + assert client is large_client, ( + f"Expected large_client (1M), got {client}. " + "L3 bug: main chain returned the first reachable candidate " + "without screening by context window.") + assert model == "huge-1m" + + # ── L4: unknown context passthrough ──────────────────────────────── + + def test_configured_chain_passes_through_unknown_context(self, monkeypatch): + """When get_model_context_length returns None (cannot probe), + the candidate is NOT filtered — the existing behaviour of using + the default 256K fallback in the resolver chain is preserved.""" + from agent.auxiliary_client import _try_configured_fallback_chain + + unknown_client = MagicMock(name="unknown_client") + entries = [self._make_chain_entry("unknown-provider", "unprobed-model")] + + def fake_resolve(entry): + return unknown_client, "unprobed-model" + + def fake_ctx(model, base_url="", api_key="", **kwargs): + return None # cannot determine context length + + monkeypatch.setattr( + "agent.auxiliary_client._get_auxiliary_task_config", + lambda task: {"fallback_chain": entries} if task == "compression" else {}, + ) + + with patch("agent.auxiliary_client._resolve_fallback_entry", + side_effect=fake_resolve), \ + patch("agent.auxiliary_client.get_model_context_length", + side_effect=fake_ctx): + client, model, label = _try_configured_fallback_chain( + task="compression", failed_provider="auto") + + assert client is unknown_client, ( + "L4 bug: candidates with unknown context must be passed through, " + "not blocked. Being unsure is not the same as being too small.") + assert model == "unprobed-model" + + # ── L5: backward compat — non-compression tasks unchanged ────────── + + def test_non_compression_task_does_not_filter_by_context(self, monkeypatch): + """For tasks without a context floor (e.g. title_generation, vision), + the chain behaviour is unchanged: first reachable candidate wins.""" + from agent.auxiliary_client import _try_configured_fallback_chain + + small_client = MagicMock(name="small") + entries = [self._make_chain_entry("p", "tiny-4k")] + + def fake_resolve(entry): + return small_client, "tiny-4k" + + def fake_ctx(model, base_url="", api_key="", **kwargs): + return 4_096 # small — but title_generation has no floor + + monkeypatch.setattr( + "agent.auxiliary_client._get_auxiliary_task_config", + lambda task: {"fallback_chain": entries} if task == "title_generation" else {}, + ) + + with patch("agent.auxiliary_client._resolve_fallback_entry", + side_effect=fake_resolve), \ + patch("agent.auxiliary_client.get_model_context_length", + side_effect=fake_ctx): + client, model, label = _try_configured_fallback_chain( + task="title_generation", failed_provider="auto") + + assert client is small_client, ( + "L5 regression: non-compression tasks must not be filtered " + "by context window. The first reachable candidate should win.") + assert model == "tiny-4k" + + # ── End-to-end: configured chain skips too-small for vision too ── + # vision has its own implicit context requirements; test that the + # compression-specific filter does NOT affect vision chains. + + def test_compression_task_uses_minimum_context_constant(self): + """The task minimum for compression must equal MINIMUM_CONTEXT_LENGTH + so the runtime fallback stays consistent with the startup feasibility + check in agent/conversation_compression.py.""" + from agent.auxiliary_client import _task_minimum_context_length + from agent.model_metadata import MINIMUM_CONTEXT_LENGTH + + assert _task_minimum_context_length("compression") == MINIMUM_CONTEXT_LENGTH + # Non-compression tasks have no minimum (None) + assert _task_minimum_context_length("vision") is None + assert _task_minimum_context_length("title_generation") is None + assert _task_minimum_context_length("web_extract") is None + assert _task_minimum_context_length("skills_hub") is None + assert _task_minimum_context_length("mcp") is None + assert _task_minimum_context_length("session_search") is None + # Empty / unknown tasks have no minimum + assert _task_minimum_context_length("") is None + assert _task_minimum_context_length(None) is None