diff --git a/gateway/run.py b/gateway/run.py index 3ef8df391b6..cab47854366 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1078,14 +1078,19 @@ def _resolve_runtime_agent_kwargs() -> dict: resolve_runtime_provider, format_runtime_provider_error, ) - from hermes_cli.auth import AuthError + from hermes_cli.auth import AuthError, is_rate_limited_auth_error try: runtime = resolve_runtime_provider() except AuthError as auth_exc: - # Primary provider auth failed (expired token, revoked key, etc.). - # Try the fallback provider chain before raising. - logger.warning("Primary provider auth failed: %s — trying fallback", auth_exc) + # Distinguish a transient rate-limit/quota cap (credentials are fine, + # re-auth cannot help) from a genuine auth failure (expired/revoked + # token). Both fall through to the fallback chain, but the log message + # must not mislabel a quota exhaustion as an auth failure (#32790). + if is_rate_limited_auth_error(auth_exc): + logger.warning("Primary provider rate-limited (429): %s — trying fallback", auth_exc) + else: + logger.warning("Primary provider auth failed: %s — trying fallback", auth_exc) fb_config = _try_resolve_fallback_provider() if fb_config is not None: return fb_config @@ -1131,9 +1136,13 @@ def _try_resolve_fallback_provider() -> dict | None: explicit_base_url=entry.get("base_url"), explicit_api_key=explicit_api_key, ) + # Log the literal `provider` key from config, not the resolved + # runtime category — an Ollama fallback resolves through the + # OpenAI-compatible path and would otherwise be logged as + # "openrouter", contradicting the operator's config (#32790). logger.info( "Fallback provider resolved: %s model=%s", - runtime.get("provider"), + entry.get("provider") or runtime.get("provider"), entry.get("model"), ) return { diff --git a/hermes_cli/auth.py b/hermes_cli/auth.py index 21aa566f9e9..c037dab7f11 100644 --- a/hermes_cli/auth.py +++ b/hermes_cli/auth.py @@ -729,6 +729,12 @@ def _resolve_zai_base_url(api_key: str, default_url: str, env_override: str) -> # Error Types # ============================================================================= +# Error code marking upstream rate-limit / usage-quota exhaustion (HTTP 429). +# Such failures are transient and re-authenticating cannot resolve them, so +# they must be kept distinct from missing/expired-credential errors. +CODEX_RATE_LIMITED_CODE = "codex_rate_limited" + + class AuthError(RuntimeError): """Structured auth error with UX mapping hints.""" @@ -746,11 +752,52 @@ class AuthError(RuntimeError): self.relogin_required = relogin_required +def is_rate_limited_auth_error(error: Exception) -> bool: + """True when an :class:`AuthError` represents upstream rate-limiting / quota + exhaustion rather than missing or invalid credentials. + + These failures are transient — re-authenticating cannot resolve them — so + callers should surface a "retry later" notice and prefer a fallback chain + instead of prompting the operator to run ``hermes auth``. + """ + return ( + isinstance(error, AuthError) + and not error.relogin_required + and error.code == CODEX_RATE_LIMITED_CODE + ) + + +def _parse_retry_after_seconds(headers: Any) -> Optional[int]: + """Best-effort parse of a ``Retry-After`` header into whole seconds. + + Supports the delta-seconds form (e.g. ``"120"``). HTTP-date forms and + missing/unparseable values return ``None`` rather than guessing. + """ + if headers is None: + return None + try: + raw = headers.get("retry-after") + except Exception: + return None + if raw is None: + return None + try: + seconds = int(str(raw).strip()) + except (TypeError, ValueError): + return None + return seconds if seconds >= 0 else None + + def format_auth_error(error: Exception) -> str: """Map auth failures to concise user-facing guidance.""" if not isinstance(error, AuthError): return str(error) + # Rate-limit / quota errors are not credential problems — never append the + # "re-authenticate" remediation, which would mislead the operator. + if is_rate_limited_auth_error(error): + return str(error) + if error.relogin_required: return f"{error} Run `hermes model` to re-authenticate." @@ -3308,6 +3355,30 @@ def refresh_codex_oauth_pure( }, ) + if response.status_code == 429: + # Upstream rate-limit / usage-quota exhaustion on the token endpoint. + # The stored refresh token is still valid here — re-authenticating + # cannot lift a quota cap. Classify distinctly from auth failures so + # callers surface a "retry later" notice instead of a misleading + # "run hermes auth" prompt (see issue #32790). + retry_after = _parse_retry_after_seconds(getattr(response, "headers", None)) + if retry_after is not None: + message = ( + f"Codex provider quota exhausted (429); retry after {retry_after}s. " + "Credentials are still valid." + ) + else: + message = ( + "Codex provider quota exhausted (429). Credentials are still valid; " + "retry after the usage limit resets." + ) + raise AuthError( + message, + provider="openai-codex", + code=CODEX_RATE_LIMITED_CODE, + relogin_required=False, + ) + if response.status_code != 200: code = "codex_refresh_failed" message = f"Codex token refresh failed with status {response.status_code}." diff --git a/tests/hermes_cli/test_auth_codex_provider.py b/tests/hermes_cli/test_auth_codex_provider.py index 47dfc4c0843..1fc0bc8e02d 100644 --- a/tests/hermes_cli/test_auth_codex_provider.py +++ b/tests/hermes_cli/test_auth_codex_provider.py @@ -263,9 +263,10 @@ def test_resolve_returns_hermes_auth_store_source(tmp_path, monkeypatch): class _StubHTTPResponse: - def __init__(self, status_code: int, payload): + def __init__(self, status_code: int, payload, headers=None): self.status_code = status_code self._payload = payload + self.headers = headers or {} self.text = json.dumps(payload) if isinstance(payload, (dict, list)) else str(payload) def json(self): @@ -382,6 +383,74 @@ def test_refresh_falls_back_to_generic_message_on_unparseable_body(monkeypatch): assert "status 401" in str(err) +def test_refresh_429_classified_as_quota_not_auth_failure(monkeypatch): + """429 from the token endpoint is a usage-quota cap, not an auth failure. + + Regression test for #32790: must NOT force relogin and must carry the + dedicated rate-limit code so callers surface a "retry later" notice rather + than a misleading "run hermes auth". + """ + from hermes_cli.auth import ( + CODEX_RATE_LIMITED_CODE, + format_auth_error, + is_rate_limited_auth_error, + ) + + response = _StubHTTPResponse( + 429, + {"error": {"message": "You hit your usage limit.", "code": "usage_limit_reached"}}, + headers={"retry-after": "120"}, + ) + _patch_httpx(monkeypatch, response) + + with pytest.raises(AuthError) as exc_info: + refresh_codex_oauth_pure("a-tok", "r-tok") + + err = exc_info.value + assert err.code == CODEX_RATE_LIMITED_CODE + assert err.relogin_required is False + assert is_rate_limited_auth_error(err) is True + assert "retry after 120s" in str(err) + # User-facing copy must not tell the operator to re-authenticate. + rendered = format_auth_error(err) + assert "re-authenticate" not in rendered + assert "hermes auth" not in rendered + + +def test_refresh_429_without_retry_after_header(monkeypatch): + """429 without a Retry-After header still classifies as quota, no relogin.""" + from hermes_cli.auth import CODEX_RATE_LIMITED_CODE + + response = _StubHTTPResponse(429, {"error": "rate_limited"}) + _patch_httpx(monkeypatch, response) + + with pytest.raises(AuthError) as exc_info: + refresh_codex_oauth_pure("a-tok", "r-tok") + + err = exc_info.value + assert err.code == CODEX_RATE_LIMITED_CODE + assert err.relogin_required is False + assert "quota exhausted" in str(err).lower() + + +def test_is_rate_limited_auth_error_distinguishes_credential_errors(): + """Missing/expired credentials must NOT be treated as rate-limit errors.""" + from hermes_cli.auth import CODEX_RATE_LIMITED_CODE, is_rate_limited_auth_error + + rate_limited = AuthError( + "quota", provider="openai-codex", code=CODEX_RATE_LIMITED_CODE, relogin_required=False + ) + missing_creds = AuthError( + "No Codex credentials stored.", + provider="openai-codex", + code="codex_auth_missing", + relogin_required=True, + ) + assert is_rate_limited_auth_error(rate_limited) is True + assert is_rate_limited_auth_error(missing_creds) is False + assert is_rate_limited_auth_error(ValueError("nope")) is False + + def test_login_openai_codex_force_new_login_skips_existing_reuse_prompt(monkeypatch): called = {"device_login": 0}