fix(gateway): classify Codex 429 quota as rate-limit, not missing credentials

When the Codex OAuth token endpoint returns 429 (usage-limit / quota exhaustion), refresh_codex_oauth_pure raised a generic auth error that the gateway surfaced as 'Primary provider auth failed: No Codex credentials stored. Run hermes auth', prompting re-auth that cannot lift a quota cap. Classify 429 distinctly (codex_rate_limited, relogin_required=False) with a non-alarming quota message that honors Retry-After, log it as 'Primary provider rate-limited (429)', and stop format_auth_error from appending the re-authenticate remediation. Also log the fallback provider's literal config key instead of the resolved runtime category. Refs #32790
2026-07-16 14:32:34 +00:00 · 2026-05-27 09:07:21 +02:00 · 2026-05-27 09:07:21 +02:00 · f1422ffd77
commit f1422ffd77
parent 2bbd53493d
3 changed files with 155 additions and 6 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -1078,14 +1078,19 @@ def _resolve_runtime_agent_kwargs() -> dict:
        resolve_runtime_provider,
        format_runtime_provider_error,
    )
-    from hermes_cli.auth import AuthError
+    from hermes_cli.auth import AuthError, is_rate_limited_auth_error

    try:
        runtime = resolve_runtime_provider()
    except AuthError as auth_exc:
-        # Primary provider auth failed (expired token, revoked key, etc.).
-        # Try the fallback provider chain before raising.
-        logger.warning("Primary provider auth failed: %s — trying fallback", auth_exc)
+        # Distinguish a transient rate-limit/quota cap (credentials are fine,
+        # re-auth cannot help) from a genuine auth failure (expired/revoked
+        # token). Both fall through to the fallback chain, but the log message
+        # must not mislabel a quota exhaustion as an auth failure (#32790).
+        if is_rate_limited_auth_error(auth_exc):
+            logger.warning("Primary provider rate-limited (429): %s — trying fallback", auth_exc)
+        else:
+            logger.warning("Primary provider auth failed: %s — trying fallback", auth_exc)
        fb_config = _try_resolve_fallback_provider()
        if fb_config is not None:
            return fb_config
@ -1131,9 +1136,13 @@ def _try_resolve_fallback_provider() -> dict | None:
                    explicit_base_url=entry.get("base_url"),
                    explicit_api_key=explicit_api_key,
                )
+                # Log the literal `provider` key from config, not the resolved
+                # runtime category — an Ollama fallback resolves through the
+                # OpenAI-compatible path and would otherwise be logged as
+                # "openrouter", contradicting the operator's config (#32790).
                logger.info(
                    "Fallback provider resolved: %s model=%s",
-                    runtime.get("provider"),
+                    entry.get("provider") or runtime.get("provider"),
                    entry.get("model"),
                )
                return {
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@ -729,6 +729,12 @@ def _resolve_zai_base_url(api_key: str, default_url: str, env_override: str) ->
 # Error Types
 # =============================================================================

+# Error code marking upstream rate-limit / usage-quota exhaustion (HTTP 429).
+# Such failures are transient and re-authenticating cannot resolve them, so
+# they must be kept distinct from missing/expired-credential errors.
+CODEX_RATE_LIMITED_CODE = "codex_rate_limited"
+
+
 class AuthError(RuntimeError):
    """Structured auth error with UX mapping hints."""

@ -746,11 +752,52 @@ class AuthError(RuntimeError):
        self.relogin_required = relogin_required


+def is_rate_limited_auth_error(error: Exception) -> bool:
+    """True when an :class:`AuthError` represents upstream rate-limiting / quota
+    exhaustion rather than missing or invalid credentials.
+
+    These failures are transient — re-authenticating cannot resolve them — so
+    callers should surface a "retry later" notice and prefer a fallback chain
+    instead of prompting the operator to run ``hermes auth``.
+    """
+    return (
+        isinstance(error, AuthError)
+        and not error.relogin_required
+        and error.code == CODEX_RATE_LIMITED_CODE
+    )
+
+
+def _parse_retry_after_seconds(headers: Any) -> Optional[int]:
+    """Best-effort parse of a ``Retry-After`` header into whole seconds.
+
+    Supports the delta-seconds form (e.g. ``"120"``). HTTP-date forms and
+    missing/unparseable values return ``None`` rather than guessing.
+    """
+    if headers is None:
+        return None
+    try:
+        raw = headers.get("retry-after")
+    except Exception:
+        return None
+    if raw is None:
+        return None
+    try:
+        seconds = int(str(raw).strip())
+    except (TypeError, ValueError):
+        return None
+    return seconds if seconds >= 0 else None
+
+
 def format_auth_error(error: Exception) -> str:
    """Map auth failures to concise user-facing guidance."""
    if not isinstance(error, AuthError):
        return str(error)

+    # Rate-limit / quota errors are not credential problems — never append the
+    # "re-authenticate" remediation, which would mislead the operator.
+    if is_rate_limited_auth_error(error):
+        return str(error)
+
    if error.relogin_required:
        return f"{error} Run `hermes model` to re-authenticate."

@ -3308,6 +3355,30 @@ def refresh_codex_oauth_pure(
            },
        )

+    if response.status_code == 429:
+        # Upstream rate-limit / usage-quota exhaustion on the token endpoint.
+        # The stored refresh token is still valid here — re-authenticating
+        # cannot lift a quota cap. Classify distinctly from auth failures so
+        # callers surface a "retry later" notice instead of a misleading
+        # "run hermes auth" prompt (see issue #32790).
+        retry_after = _parse_retry_after_seconds(getattr(response, "headers", None))
+        if retry_after is not None:
+            message = (
+                f"Codex provider quota exhausted (429); retry after {retry_after}s. "
+                "Credentials are still valid."
+            )
+        else:
+            message = (
+                "Codex provider quota exhausted (429). Credentials are still valid; "
+                "retry after the usage limit resets."
+            )
+        raise AuthError(
+            message,
+            provider="openai-codex",
+            code=CODEX_RATE_LIMITED_CODE,
+            relogin_required=False,
+        )
+
    if response.status_code != 200:
        code = "codex_refresh_failed"
        message = f"Codex token refresh failed with status {response.status_code}."
--- a/tests/hermes_cli/test_auth_codex_provider.py
+++ b/tests/hermes_cli/test_auth_codex_provider.py
@ -263,9 +263,10 @@ def test_resolve_returns_hermes_auth_store_source(tmp_path, monkeypatch):


 class _StubHTTPResponse:
-    def __init__(self, status_code: int, payload):
+    def __init__(self, status_code: int, payload, headers=None):
        self.status_code = status_code
        self._payload = payload
+        self.headers = headers or {}
        self.text = json.dumps(payload) if isinstance(payload, (dict, list)) else str(payload)

    def json(self):
@ -382,6 +383,74 @@ def test_refresh_falls_back_to_generic_message_on_unparseable_body(monkeypatch):
    assert "status 401" in str(err)


+def test_refresh_429_classified_as_quota_not_auth_failure(monkeypatch):
+    """429 from the token endpoint is a usage-quota cap, not an auth failure.
+
+    Regression test for #32790: must NOT force relogin and must carry the
+    dedicated rate-limit code so callers surface a "retry later" notice rather
+    than a misleading "run hermes auth".
+    """
+    from hermes_cli.auth import (
+        CODEX_RATE_LIMITED_CODE,
+        format_auth_error,
+        is_rate_limited_auth_error,
+    )
+
+    response = _StubHTTPResponse(
+        429,
+        {"error": {"message": "You hit your usage limit.", "code": "usage_limit_reached"}},
+        headers={"retry-after": "120"},
+    )
+    _patch_httpx(monkeypatch, response)
+
+    with pytest.raises(AuthError) as exc_info:
+        refresh_codex_oauth_pure("a-tok", "r-tok")
+
+    err = exc_info.value
+    assert err.code == CODEX_RATE_LIMITED_CODE
+    assert err.relogin_required is False
+    assert is_rate_limited_auth_error(err) is True
+    assert "retry after 120s" in str(err)
+    # User-facing copy must not tell the operator to re-authenticate.
+    rendered = format_auth_error(err)
+    assert "re-authenticate" not in rendered
+    assert "hermes auth" not in rendered
+
+
+def test_refresh_429_without_retry_after_header(monkeypatch):
+    """429 without a Retry-After header still classifies as quota, no relogin."""
+    from hermes_cli.auth import CODEX_RATE_LIMITED_CODE
+
+    response = _StubHTTPResponse(429, {"error": "rate_limited"})
+    _patch_httpx(monkeypatch, response)
+
+    with pytest.raises(AuthError) as exc_info:
+        refresh_codex_oauth_pure("a-tok", "r-tok")
+
+    err = exc_info.value
+    assert err.code == CODEX_RATE_LIMITED_CODE
+    assert err.relogin_required is False
+    assert "quota exhausted" in str(err).lower()
+
+
+def test_is_rate_limited_auth_error_distinguishes_credential_errors():
+    """Missing/expired credentials must NOT be treated as rate-limit errors."""
+    from hermes_cli.auth import CODEX_RATE_LIMITED_CODE, is_rate_limited_auth_error
+
+    rate_limited = AuthError(
+        "quota", provider="openai-codex", code=CODEX_RATE_LIMITED_CODE, relogin_required=False
+    )
+    missing_creds = AuthError(
+        "No Codex credentials stored.",
+        provider="openai-codex",
+        code="codex_auth_missing",
+        relogin_required=True,
+    )
+    assert is_rate_limited_auth_error(rate_limited) is True
+    assert is_rate_limited_auth_error(missing_creds) is False
+    assert is_rate_limited_auth_error(ValueError("nope")) is False
+
+
 def test_login_openai_codex_force_new_login_skips_existing_reuse_prompt(monkeypatch):
    called = {"device_login": 0}