fix(auxiliary): screen fallback chain by context window for compression (#52392)

The runtime auxiliary fallback chain (_try_configured_fallback_chain and
_try_main_fallback_chain) returned the first reachable candidate without
checking whether the candidate's context window was large enough for the
task. For task='compression' this meant a reachable but undersized
fallback (e.g. 32K) could be selected and then fail, even when a later
larger-context fallback was available.

This adds two small helpers:

  _task_minimum_context_length(task)
      Returns MINIMUM_CONTEXT_LENGTH (64K) for compression, None for
      other tasks (vision, web_extract, etc.).

  _candidate_context_window(provider, model, ...)
      Thin wrapper around get_model_context_length that returns None on
      probe failure so unknown/custom endpoints pass through unchanged
      (preserves the existing fallback surface).

Both fallback loops now skip reachable candidates whose resolved context
is below the task minimum and continue iterating. The success path
(first viable candidate wins) is unchanged. Return shape and ordering
for healthy candidates are preserved.

Six regression tests cover:
  L2 configured chain skips too-small candidate
  L2 chain continues after skipping, returns last viable
  L3 main chain skips too-small candidate
  L4 unknown-context candidate passes through
  L5 non-compression task is not filtered
  L6 minimum constant matches MINIMUM_CONTEXT_LENGTH (64K)

3/6 fail on upstream/main without the production change (verified); all
6 pass with the fix. Full test_auxiliary_client.py suite (231 tests)
and related compression tests (130 tests) remain green.
This commit is contained in:
Tranquil-Flow 2026-06-25 10:27:23 +02:00 committed by Teknium
parent b82c83d320
commit e4d026aa3b
2 changed files with 367 additions and 0 deletions

View file

@ -101,6 +101,7 @@ class _OpenAIProxy:
OpenAI = _OpenAIProxy() # module-level name, resolves lazily on call/isinstance
from agent.credential_pool import load_pool
from agent.model_metadata import MINIMUM_CONTEXT_LENGTH, get_model_context_length
from hermes_cli.config import get_hermes_home
from hermes_constants import OPENROUTER_BASE_URL
from utils import base_url_host_matches, base_url_hostname, env_float, model_forces_max_completion_tokens, normalize_proxy_env_vars
@ -3149,6 +3150,88 @@ def _try_main_agent_model_fallback(
return client, resolved_model or main_model, label
# ── Context-window screening for runtime fallback chains (issue #52392) ──
#
# When the runtime auxiliary fallback chain selects a candidate that is
# reachable but has a context window smaller than the compression task
# requires, the call errors out instead of continuing to the next, viable
# candidate. The startup feasibility check in
# ``agent.conversation_compression.check_compression_model_feasibility``
# already filters too-small auxiliary models at startup, but the runtime
# fallback chain (``_try_configured_fallback_chain`` and
# ``_try_main_fallback_chain``) does not apply the same filter, so
# compression can stop at the first alive door even if the room behind it
# is too small.
#
# The helpers below screen each candidate by its effective context window
# before it is returned. When the window cannot be determined (the
# ``get_model_context_length`` probe raises, or does not yield a positive
# integer), the candidate is passed through: we cannot prove a model is too
# small, so we do not block it. This preserves the existing fallback surface
# for unrecognised/custom models while closing the gap on the well-known ones.
def _task_minimum_context_length(task: Optional[str]) -> Optional[int]:
"""Return the minimum context length required for an auxiliary task.
Only ``compression`` carries an explicit minimum today (the same
``MINIMUM_CONTEXT_LENGTH`` (64K) floor that
``check_compression_model_feasibility`` already enforces at startup).
Other tasks (``vision``, ``title_generation``, ``web_extract``,
``skills_hub``, ``mcp``, ``session_search``) return ``None`` they
have no per-task context floor and the runtime chain must remain
permissive for them.
Returns ``None`` for an empty/``None`` task name so the helper is a
safe no-op when called from generic sites.
"""
if not task:
return None
if task == "compression":
return MINIMUM_CONTEXT_LENGTH
return None
def _candidate_context_window(
provider: str,
model: str,
base_url: str = "",
api_key: str = "",
) -> Optional[int]:
"""Resolve the effective context window for a fallback candidate.
Thin wrapper around :func:`agent.model_metadata.get_model_context_length`
that swallows probe failures (returns ``None``). Callers treat
``None`` as "unknown — pass through" so the existing fallback
surface is preserved when the context-length resolver chain cannot
determine a value (custom endpoints, models not in the registry,
offline endpoints).
Best-effort, never raises the runtime fallback chain must keep
moving even if the resolver hits a probe error.
"""
if not model:
return None
try:
ctx = get_model_context_length(
model,
base_url=base_url,
api_key=api_key,
provider=provider,
)
except Exception as exc:
logger.debug(
"Auxiliary fallback: could not resolve context window for %s/%s: %s",
provider, model, exc,
)
return None
# ``get_model_context_length`` returns an int (with a 256K default
# fallback when nothing else matches). We still propagate ``None`` if
# a future change returns ``Optional[int]`` — being explicit is
# cheap and the test suite covers both shapes.
if isinstance(ctx, int) and ctx > 0:
return ctx
return None
def _try_configured_fallback_chain(
task: str,
failed_provider: str,
@ -3173,6 +3256,7 @@ def _try_configured_fallback_chain(
skip = failed_provider.lower().strip()
tried = []
min_ctx = _task_minimum_context_length(task)
for i, entry in enumerate(chain):
if not isinstance(entry, dict):
@ -3190,6 +3274,20 @@ def _try_configured_fallback_chain(
fb_client, resolved_model = None, None
if fb_client is not None:
if min_ctx is not None and resolved_model:
fb_ctx = _candidate_context_window(
fb_provider,
resolved_model,
base_url=str(entry.get("base_url") or ""),
api_key=_fallback_entry_api_key(entry) or "",
)
if fb_ctx is not None and fb_ctx < min_ctx:
logger.info(
"Auxiliary %s: skipping %s (%s context=%d < min=%d), continuing chain",
task, label, resolved_model, fb_ctx, min_ctx,
)
tried.append(f"{label} (context too small: {fb_ctx}<{min_ctx})")
continue
logger.info(
"Auxiliary %s: %s on %s — configured fallback to %s (%s)",
task, reason, failed_provider, label, resolved_model or fb_model or "default",
@ -3285,6 +3383,7 @@ def _try_main_fallback_chain(
main_norm = (_read_main_provider() or "").strip().lower()
skip = {p for p in (failed_norm, main_norm, "auto") if p}
tried: List[str] = []
min_ctx = _task_minimum_context_length(task)
for i, entry in enumerate(chain):
if not isinstance(entry, dict):
@ -3308,6 +3407,20 @@ def _try_main_fallback_chain(
logger.debug("Auxiliary %s: main fallback %s failed to resolve: %s", task or "call", label, exc)
fb_client, resolved_model = None, None
if fb_client is not None:
if min_ctx is not None:
fb_ctx = _candidate_context_window(
fb_provider,
resolved_model or fb_model,
base_url=str(entry.get("base_url") or ""),
api_key=_fallback_entry_api_key(entry) or "",
)
if fb_ctx is not None and fb_ctx < min_ctx:
logger.info(
"Auxiliary %s: skipping %s (context=%d < min=%d), continuing chain",
task or "call", label, fb_ctx, min_ctx,
)
tried.append(f"{label} (context too small: {fb_ctx}<{min_ctx})")
continue
logger.info(
"Auxiliary %s: %s on %s — main fallback chain to %s (%s)",
task or "call", reason, failed_provider or "auto", label,

View file

@ -4198,3 +4198,257 @@ class TestAuxiliaryMaxTokensParam:
):
assert auxiliary_max_tokens_param(4096, model="") == {"max_tokens": 4096}
assert auxiliary_max_tokens_param(4096, model=None) == {"max_tokens": 4096}
# ── Regression tests for issue #52392 ─────────────────────────────────────
# Compression fallback chain currently picks the first reachable candidate
# without checking whether the candidate's context window is large enough.
# When the chosen candidate is reachable but too small for the compression
# task, the call errors out instead of continuing through the chain.
class TestCompressionFallbackContextFilter:
    """Aux fallback chains must skip candidates whose context window is
    smaller than the task minimum, then continue to the next candidate.

    Layer coverage:
      L2: _try_configured_fallback_chain skips too-small candidates
      L3: _try_main_fallback_chain skips too-small candidates
      L4: candidates with unknown context (None) are passed through
      L5: backward compat — tasks without a context floor are not filtered
      L6: the compression floor equals MINIMUM_CONTEXT_LENGTH
    """

    # Builds one fallback-chain entry dict with the four keys the tests use
    # (provider, model, base_url, api_key).
    @staticmethod
    def _make_chain_entry(provider, model, base_url="https://example.com/v1",
                          api_key="k"):
        return {
            "provider": provider,
            "model": model,
            "base_url": base_url,
            "api_key": api_key,
        }

    def _mock_resolve(self, entry):
        """Mock _resolve_fallback_entry to return a (client, model) per entry.

        NOTE(review): no test visible in this class calls this helper; each
        test defines its own ``fake_resolve`` instead.
        """
        client = MagicMock()
        client.base_url = entry.get("base_url", "")
        return client, entry["model"]

    # ── L2: configured fallback chain ─────────────────────────────────
    def test_configured_chain_skips_too_small_candidate_for_compression(self, monkeypatch):
        """When entry[0] is reachable but too small and entry[1] is large enough,
        _try_configured_fallback_chain must return entry[1], not entry[0]."""
        from agent.auxiliary_client import (
            _try_configured_fallback_chain,
        )
        small_client = MagicMock(name="small_client")
        large_client = MagicMock(name="large_client")
        entries = [
            self._make_chain_entry("small-provider", "tiny-8k"),
            self._make_chain_entry("big-provider", "huge-1m"),
        ]

        # Dispatch on entry identity so each chain entry maps to its own client.
        def fake_resolve(entry):
            if entry is entries[0]:
                return small_client, "tiny-8k"
            return large_client, "huge-1m"

        # tiny-8k resolves to 8K (below 64K floor); huge-1m resolves to 1M
        def fake_ctx(model, base_url="", api_key="", **kwargs):
            return {"tiny-8k": 8192, "huge-1m": 1_048_576}.get(model, 256_000)

        # Only the compression task is given a configured fallback chain.
        monkeypatch.setattr(
            "agent.auxiliary_client._get_auxiliary_task_config",
            lambda task: {"fallback_chain": entries} if task == "compression" else {},
        )
        with patch("agent.auxiliary_client._resolve_fallback_entry",
                   side_effect=fake_resolve), \
             patch("agent.auxiliary_client.get_model_context_length",
                   side_effect=fake_ctx):
            client, model, label = _try_configured_fallback_chain(
                task="compression", failed_provider="auto")
        assert client is large_client, (
            f"Expected large_client (1M context), got {client}. "
            "L2 bug: chain returned the first reachable candidate without "
            "screening by context window.")
        assert model == "huge-1m"
        assert "big-provider" in label

    def test_configured_chain_continues_after_skipping_too_small(self, monkeypatch):
        """When all small candidates are skipped and only the last is large enough,
        the chain still returns it (does not stop after first filter)."""
        from agent.auxiliary_client import _try_configured_fallback_chain
        small_client_a = MagicMock(name="small_a")
        small_client_b = MagicMock(name="small_b")
        large_client = MagicMock(name="large")
        entries = [
            self._make_chain_entry("p1", "small-a-32k"),
            self._make_chain_entry("p2", "small-b-48k"),
            self._make_chain_entry("p3", "large-512k"),
        ]

        # Dispatch on entry identity; anything past the first two is "large".
        def fake_resolve(entry):
            if entry is entries[0]:
                return small_client_a, "small-a-32k"
            if entry is entries[1]:
                return small_client_b, "small-b-48k"
            return large_client, "large-512k"

        # The first two windows are below the 64K floor; only the last clears it.
        def fake_ctx(model, base_url="", api_key="", **kwargs):
            return {"small-a-32k": 32_000,
                    "small-b-48k": 48_000,
                    "large-512k": 512_000}.get(model, 256_000)

        monkeypatch.setattr(
            "agent.auxiliary_client._get_auxiliary_task_config",
            lambda task: {"fallback_chain": entries} if task == "compression" else {},
        )
        with patch("agent.auxiliary_client._resolve_fallback_entry",
                   side_effect=fake_resolve), \
             patch("agent.auxiliary_client.get_model_context_length",
                   side_effect=fake_ctx):
            client, model, label = _try_configured_fallback_chain(
                task="compression", failed_provider="auto")
        assert client is large_client
        assert model == "large-512k"

    # ── L3: main fallback chain ────────────────────────────────────────
    def test_main_chain_skips_too_small_candidate_for_compression(self, monkeypatch):
        """Same behaviour for the top-level main-agent fallback chain."""
        from agent.auxiliary_client import (
            _try_main_fallback_chain,
        )
        small_client = MagicMock(name="small_main")
        large_client = MagicMock(name="large_main")
        # Controlled chain, served through the patched get_fallback_chain below.
        # NOTE(review): load_config itself is not patched in this test —
        # confirm the result does not depend on the on-disk configuration.
        chain = [
            self._make_chain_entry("p-small", "tiny-16k"),
            self._make_chain_entry("p-large", "huge-1m"),
        ]

        def fake_resolve(entry):
            if entry is chain[0]:
                return small_client, "tiny-16k"
            return large_client, "huge-1m"

        # tiny-16k is below the 64K floor; huge-1m is well above it.
        def fake_ctx(model, base_url="", api_key="", **kwargs):
            return {"tiny-16k": 16_384, "huge-1m": 1_048_576}.get(model, 256_000)

        monkeypatch.setattr(
            "hermes_cli.fallback_config.get_fallback_chain",
            lambda cfg: chain,
        )
        # Every provider is reported healthy so that only the context-window
        # screen can cause a candidate to be skipped.
        with patch("agent.auxiliary_client._resolve_fallback_entry",
                   side_effect=fake_resolve), \
             patch("agent.auxiliary_client.get_model_context_length",
                   side_effect=fake_ctx), \
             patch("agent.auxiliary_client._is_provider_unhealthy",
                   return_value=False):
            client, model, label = _try_main_fallback_chain(
                task="compression", failed_provider="auto")
        assert client is large_client, (
            f"Expected large_client (1M), got {client}. "
            "L3 bug: main chain returned the first reachable candidate "
            "without screening by context window.")
        assert model == "huge-1m"

    # ── L4: unknown context passthrough ────────────────────────────────
    def test_configured_chain_passes_through_unknown_context(self, monkeypatch):
        """When get_model_context_length returns None (cannot probe),
        the candidate is NOT filtered — an unknown window is treated as
        pass-through, so the candidate is still returned as it was before
        the screen existed."""
        from agent.auxiliary_client import _try_configured_fallback_chain
        unknown_client = MagicMock(name="unknown_client")
        entries = [self._make_chain_entry("unknown-provider", "unprobed-model")]

        def fake_resolve(entry):
            return unknown_client, "unprobed-model"

        def fake_ctx(model, base_url="", api_key="", **kwargs):
            return None  # cannot determine context length

        monkeypatch.setattr(
            "agent.auxiliary_client._get_auxiliary_task_config",
            lambda task: {"fallback_chain": entries} if task == "compression" else {},
        )
        with patch("agent.auxiliary_client._resolve_fallback_entry",
                   side_effect=fake_resolve), \
             patch("agent.auxiliary_client.get_model_context_length",
                   side_effect=fake_ctx):
            client, model, label = _try_configured_fallback_chain(
                task="compression", failed_provider="auto")
        assert client is unknown_client, (
            "L4 bug: candidates with unknown context must be passed through, "
            "not blocked. Being unsure is not the same as being too small.")
        assert model == "unprobed-model"

    # ── L5: backward compat — non-compression tasks unchanged ──────────
    def test_non_compression_task_does_not_filter_by_context(self, monkeypatch):
        """For tasks without a context floor (e.g. title_generation, vision),
        the chain behaviour is unchanged: first reachable candidate wins."""
        from agent.auxiliary_client import _try_configured_fallback_chain
        small_client = MagicMock(name="small")
        entries = [self._make_chain_entry("p", "tiny-4k")]

        def fake_resolve(entry):
            return small_client, "tiny-4k"

        def fake_ctx(model, base_url="", api_key="", **kwargs):
            return 4_096  # small — but title_generation has no floor

        monkeypatch.setattr(
            "agent.auxiliary_client._get_auxiliary_task_config",
            lambda task: {"fallback_chain": entries} if task == "title_generation" else {},
        )
        with patch("agent.auxiliary_client._resolve_fallback_entry",
                   side_effect=fake_resolve), \
             patch("agent.auxiliary_client.get_model_context_length",
                   side_effect=fake_ctx):
            client, model, label = _try_configured_fallback_chain(
                task="title_generation", failed_provider="auto")
        assert client is small_client, (
            "L5 regression: non-compression tasks must not be filtered "
            "by context window. The first reachable candidate should win.")
        assert model == "tiny-4k"

    # ── L6: compression floor constant ─────────────────────────────────
    # Pins the per-task floor helper itself: compression maps to
    # MINIMUM_CONTEXT_LENGTH, every other task name (and empty/None) maps
    # to None, so the compression-specific filter cannot affect other chains.
    def test_compression_task_uses_minimum_context_constant(self):
        """The task minimum for compression must equal MINIMUM_CONTEXT_LENGTH
        so the runtime fallback stays consistent with the startup feasibility
        check in agent/conversation_compression.py."""
        from agent.auxiliary_client import _task_minimum_context_length
        from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
        assert _task_minimum_context_length("compression") == MINIMUM_CONTEXT_LENGTH
        # Non-compression tasks have no minimum (None)
        assert _task_minimum_context_length("vision") is None
        assert _task_minimum_context_length("title_generation") is None
        assert _task_minimum_context_length("web_extract") is None
        assert _task_minimum_context_length("skills_hub") is None
        assert _task_minimum_context_length("mcp") is None
        assert _task_minimum_context_length("session_search") is None
        # Empty / unknown tasks have no minimum
        assert _task_minimum_context_length("") is None
        assert _task_minimum_context_length(None) is None