fix(aux): self-heal Nous-routed calls when a pinned model leaves the catalog (#37732)

A long-lived process (gateway, watcher) caches the Nous Portal's recommended-models payload and can pin a model for its whole lifetime. When that model is later dropped from the Nous -> OpenRouter catalog, every auxiliary call 404s with 'model does not exist in our configuration or OpenRouter catalog' until the process restarts. Now such a 404 force-refreshes the Portal recommendation and retries once with the current pick (or the gemini-3-flash-preview default). Scoped to Nous-routed calls only. - _is_model_not_found_error(): 404/400 'not found / does not exist / not a valid model' predicate, excludes billing keywords so it never overlaps _is_payment_error. - _refresh_nous_recommended_model(): force-refresh fetch, returns a model distinct from the one that failed, else the known-good default. - Wired into both call_llm and async_call_llm error chains.
2026-06-09 08:21:50 +00:00 · 2026-06-02 17:14:36 -07:00 · 2026-06-02 17:14:36 -07:00 · ab2472e692
commit ab2472e692
parent bb1c8b6f1a
2 changed files with 236 additions and 0 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -1621,6 +1621,47 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
    )


+def _refresh_nous_recommended_model(
+    *, vision: bool, stale_model: Optional[str]
+) -> Optional[str]:
+    """Re-fetch the Nous Portal's recommended model after a stale-model 404.
+
+    Long-lived processes (gateway, watchers) cache the Portal's
+    ``recommended-models`` payload for 10 minutes and, in practice, can pin a
+    model for the whole process lifetime. When that model is later dropped from
+    the Nous → OpenRouter catalog, every auxiliary call 404s with
+    "model does not exist". This forces a fresh Portal fetch and returns a
+    model name to retry with:
+
+      * the Portal's current recommendation for the task, if it differs from
+        the model that just failed; otherwise
+      * ``_NOUS_MODEL`` (google/gemini-3-flash-preview), the known-good default,
+        if it too differs from the failed model.
+
+    Returns ``None`` when no usable alternative is available (e.g. the Portal
+    still recommends the exact model that just 404'd and the default also
+    matches it) — callers should then let the original error propagate.
+    """
+    stale = (stale_model or "").strip().lower()
+    fresh: Optional[str] = None
+    try:
+        from hermes_cli.models import get_nous_recommended_aux_model
+
+        fresh = get_nous_recommended_aux_model(vision=vision, force_refresh=True)
+    except Exception as exc:
+        logger.debug(
+            "Nous recommended-model refresh failed (%s); using default %s",
+            exc, _NOUS_MODEL,
+        )
+    if fresh and fresh.strip().lower() != stale:
+        return fresh
+    # Portal recommendation unchanged or unavailable — fall back to the
+    # hardcoded known-good default, but only if it's actually different.
+    if _NOUS_MODEL.strip().lower() != stale:
+        return _NOUS_MODEL
+    return None
+
+
 def _read_main_model() -> str:
    """Read the user's configured main model from config.yaml.

@ -2451,6 +2492,46 @@ def _is_unsupported_temperature_error(exc: Exception) -> bool:
    return _is_unsupported_parameter_error(exc, "temperature")


+def _is_model_not_found_error(exc: Exception) -> bool:
+    """Detect "the requested model doesn't exist" errors (404 / invalid model).
+
+    This fires when a resolved model name is no longer served by the endpoint
+    — most commonly when a long-lived process pinned a Portal-recommended model
+    that has since been dropped from the Nous → OpenRouter catalog. The Nous
+    proxy returns 404 with a body like::
+
+        Model 'gpt-5.4-mini' not found. The requested model does not exist
+        in our configuration or OpenRouter catalog.
+
+    Distinct from :func:`_is_payment_error` (which also matches some 404s for
+    free-tier/credit language) — this one keys on "does not exist / not found /
+    not a valid model" phrasing, and explicitly excludes the billing keywords
+    that the payment path already owns so the two predicates don't overlap.
+    """
+    status = getattr(exc, "status_code", None)
+    err_lower = str(exc).lower()
+    # Billing/quota 404s belong to _is_payment_error — don't claim them here.
+    if any(kw in err_lower for kw in (
+        "credits", "insufficient funds", "billing", "out of funds",
+        "balance_depleted", "no usable credits", "free tier", "free-tier",
+        "not available on the free tier",
+    )):
+        return False
+    if status not in {404, 400, None}:
+        return False
+    return any(kw in err_lower for kw in (
+        "model does not exist",
+        "does not exist in our configuration",
+        "openrouter catalog",
+        "is not a valid model",
+        "no such model",
+        "model not found",
+        "the model `",            # OpenAI-style: "The model `X` does not exist"
+        "model_not_found",
+        "unknown model",
+    ))
+
+
 def _evict_cached_clients(provider: str) -> None:
    """Drop cached auxiliary clients for a provider so fresh creds are used."""
    normalized = _normalize_aux_provider(provider)
@ -5027,6 +5108,32 @@ def call_llm(
                    raise
                first_err = retry_err

+        # ── Stale-model self-heal (Nous Portal recommendation drift) ───
+        # A long-lived process can pin a Portal-recommended model that has
+        # since been dropped from the Nous → OpenRouter catalog, so every
+        # auxiliary call 404s with "model does not exist". Force a fresh
+        # Portal fetch and retry once with the current recommendation (or the
+        # known-good default). Only applies to Nous-routed calls.
+        _heal_is_nous = (
+            resolved_provider == "nous"
+            or base_url_host_matches(_base_info, "inference-api.nousresearch.com")
+        )
+        if _is_model_not_found_error(first_err) and _heal_is_nous:
+            healed_model = _refresh_nous_recommended_model(
+                vision=(task == "vision"), stale_model=kwargs.get("model"))
+            if healed_model and healed_model != kwargs.get("model"):
+                logger.warning(
+                    "Auxiliary %s: model %r no longer in Nous catalog; "
+                    "retrying with refreshed recommendation %r",
+                    task or "call", kwargs.get("model"), healed_model,
+                )
+                kwargs["model"] = healed_model
+                try:
+                    return _validate_llm_response(
+                        client.chat.completions.create(**kwargs), task)
+                except Exception as retry_err:
+                    first_err = retry_err
+
        # ── Nous auth refresh parity with main agent ──────────────────
        client_is_nous = (
            resolved_provider == "nous"
@ -5464,6 +5571,31 @@ async def async_call_llm(
                    raise
                first_err = retry_err

+        # ── Stale-model self-heal (Nous Portal recommendation drift) ───
+        # See the sync call_llm() path for the rationale: a long-lived process
+        # can pin a Portal-recommended model that has since been dropped from
+        # the Nous → OpenRouter catalog, 404'ing every auxiliary call. Force a
+        # fresh Portal fetch and retry once with the current recommendation.
+        _heal_is_nous = (
+            resolved_provider == "nous"
+            or base_url_host_matches(_client_base, "inference-api.nousresearch.com")
+        )
+        if _is_model_not_found_error(first_err) and _heal_is_nous:
+            healed_model = _refresh_nous_recommended_model(
+                vision=(task == "vision"), stale_model=kwargs.get("model"))
+            if healed_model and healed_model != kwargs.get("model"):
+                logger.warning(
+                    "Auxiliary %s (async): model %r no longer in Nous catalog; "
+                    "retrying with refreshed recommendation %r",
+                    task or "call", kwargs.get("model"), healed_model,
+                )
+                kwargs["model"] = healed_model
+                try:
+                    return _validate_llm_response(
+                        await client.chat.completions.create(**kwargs), task)
+                except Exception as retry_err:
+                    first_err = retry_err
+
        # ── Nous auth refresh parity with main agent ──────────────────
        client_is_nous = (
            resolved_provider == "nous"
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@ -22,6 +22,8 @@ from agent.auxiliary_client import (
    _get_provider_chain,
    _is_payment_error,
    _is_rate_limit_error,
+    _is_model_not_found_error,
+    _refresh_nous_recommended_model,
    _normalize_aux_provider,
    _try_payment_fallback,
    _resolve_auto,
@ -1298,6 +1300,108 @@ class TestIsPaymentError:
        assert _is_payment_error(exc) is False


+class TestIsModelNotFoundError:
+    """_is_model_not_found_error detects stale/invalid model 404s, distinct
+    from payment errors."""
+
+    def test_nous_openrouter_catalog_404(self):
+        """The exact incident error: a Portal-recommended model dropped from
+        the Nous → OpenRouter catalog."""
+        exc = Exception(
+            "Model 'gpt-5.4-mini' not found. The requested model does not "
+            "exist in our configuration or OpenRouter catalog."
+        )
+        exc.status_code = 404
+        assert _is_model_not_found_error(exc) is True
+
+    def test_openai_style_model_does_not_exist(self):
+        exc = Exception("The model `gpt-9-turbo` does not exist")
+        exc.status_code = 404
+        assert _is_model_not_found_error(exc) is True
+
+    def test_invalid_model_id_400(self):
+        exc = Exception("openrouter/foo/bar is not a valid model ID")
+        exc.status_code = 400
+        assert _is_model_not_found_error(exc) is True
+
+    def test_no_such_model(self):
+        exc = Exception("no such model: phantom-v1")
+        exc.status_code = 400
+        assert _is_model_not_found_error(exc) is True
+
+    def test_billing_404_is_not_model_not_found(self):
+        """Free-tier / credit 404s belong to _is_payment_error, not here —
+        the two predicates must not overlap."""
+        exc = Exception(
+            "Model 'gpt-5' is not available on the free tier. Upgrade."
+        )
+        exc.status_code = 404
+        assert _is_model_not_found_error(exc) is False
+        assert _is_payment_error(exc) is True
+
+    def test_out_of_funds_404_is_not_model_not_found(self):
+        exc = Exception(
+            "Your API key is blocked or out of funds. model_not_found"
+        )
+        exc.status_code = 404
+        # billing keyword wins — payment owns it
+        assert _is_model_not_found_error(exc) is False
+
+    def test_rate_limit_is_not_model_not_found(self):
+        exc = Exception("rate limit exceeded, retry after 5s")
+        exc.status_code = 429
+        assert _is_model_not_found_error(exc) is False
+
+    def test_500_is_not_model_not_found(self):
+        exc = Exception("model does not exist")  # right phrase, wrong status
+        exc.status_code = 500
+        assert _is_model_not_found_error(exc) is False
+
+
+class TestRefreshNousRecommendedModel:
+    """_refresh_nous_recommended_model picks a fresh model after a stale 404."""
+
+    def test_returns_fresh_portal_recommendation(self, monkeypatch):
+        monkeypatch.setattr(
+            "hermes_cli.models.get_nous_recommended_aux_model",
+            lambda **kw: "stepfun/step-3.7-flash:free",
+        )
+        out = _refresh_nous_recommended_model(
+            vision=True, stale_model="openai/gpt-5.4-mini")
+        assert out == "stepfun/step-3.7-flash:free"
+
+    def test_falls_back_to_default_when_portal_matches_stale(self, monkeypatch):
+        """If the Portal still recommends the model that just 404'd, fall back
+        to the known-good default."""
+        monkeypatch.setattr(
+            "hermes_cli.models.get_nous_recommended_aux_model",
+            lambda **kw: "openai/gpt-5.4-mini",
+        )
+        out = _refresh_nous_recommended_model(
+            vision=True, stale_model="openai/gpt-5.4-mini")
+        assert out == "google/gemini-3-flash-preview"
+
+    def test_falls_back_to_default_when_portal_unavailable(self, monkeypatch):
+        def _boom(**kw):
+            raise RuntimeError("portal down")
+        monkeypatch.setattr(
+            "hermes_cli.models.get_nous_recommended_aux_model", _boom)
+        out = _refresh_nous_recommended_model(
+            vision=False, stale_model="some/dead-model")
+        assert out == "google/gemini-3-flash-preview"
+
+    def test_returns_none_when_no_distinct_alternative(self, monkeypatch):
+        """When the failed model IS the default and the Portal has nothing
+        else, there's no usable alternative."""
+        monkeypatch.setattr(
+            "hermes_cli.models.get_nous_recommended_aux_model",
+            lambda **kw: "google/gemini-3-flash-preview",
+        )
+        out = _refresh_nous_recommended_model(
+            vision=False, stale_model="google/gemini-3-flash-preview")
+        assert out is None
+
+
 class TestIsRateLimitError:
    """_is_rate_limit_error detects 429 rate-limit errors warranting fallback."""