fix(gateway): classify Codex 429 quota as rate-limit, not missing credentials

When the Codex OAuth token endpoint returns 429 (usage-limit / quota
exhaustion), refresh_codex_oauth_pure raised a generic auth error that the
gateway surfaced as 'Primary provider auth failed: No Codex credentials
stored. Run hermes auth', prompting re-auth that cannot lift a quota cap.

Classify 429 distinctly (codex_rate_limited, relogin_required=False) with a
non-alarming quota message that honors Retry-After, log it as
'Primary provider rate-limited (429)', and stop format_auth_error from
appending the re-authenticate remediation. Also log the fallback provider's
literal config key instead of the resolved runtime category.

Refs #32790
This commit is contained in:
konsisumer 2026-05-27 09:07:21 +02:00 committed by Teknium
parent 2bbd53493d
commit f1422ffd77
3 changed files with 155 additions and 6 deletions

View file

@ -1078,14 +1078,19 @@ def _resolve_runtime_agent_kwargs() -> dict:
resolve_runtime_provider,
format_runtime_provider_error,
)
from hermes_cli.auth import AuthError
from hermes_cli.auth import AuthError, is_rate_limited_auth_error
try:
runtime = resolve_runtime_provider()
except AuthError as auth_exc:
# Primary provider auth failed (expired token, revoked key, etc.).
# Try the fallback provider chain before raising.
logger.warning("Primary provider auth failed: %s — trying fallback", auth_exc)
# Distinguish a transient rate-limit/quota cap (credentials are fine,
# re-auth cannot help) from a genuine auth failure (expired/revoked
# token). Both fall through to the fallback chain, but the log message
# must not mislabel a quota exhaustion as an auth failure (#32790).
if is_rate_limited_auth_error(auth_exc):
logger.warning("Primary provider rate-limited (429): %s — trying fallback", auth_exc)
else:
logger.warning("Primary provider auth failed: %s — trying fallback", auth_exc)
fb_config = _try_resolve_fallback_provider()
if fb_config is not None:
return fb_config
@ -1131,9 +1136,13 @@ def _try_resolve_fallback_provider() -> dict | None:
explicit_base_url=entry.get("base_url"),
explicit_api_key=explicit_api_key,
)
# Log the literal `provider` key from config, not the resolved
# runtime category — an Ollama fallback resolves through the
# OpenAI-compatible path and would otherwise be logged as
# "openrouter", contradicting the operator's config (#32790).
logger.info(
"Fallback provider resolved: %s model=%s",
runtime.get("provider"),
entry.get("provider") or runtime.get("provider"),
entry.get("model"),
)
return {

View file

@ -729,6 +729,12 @@ def _resolve_zai_base_url(api_key: str, default_url: str, env_override: str) ->
# Error Types
# =============================================================================
# Error code marking upstream rate-limit / usage-quota exhaustion (HTTP 429).
# Such failures are transient and re-authenticating cannot resolve them, so
# they must be kept distinct from missing/expired-credential errors.
CODEX_RATE_LIMITED_CODE = "codex_rate_limited"
class AuthError(RuntimeError):
"""Structured auth error with UX mapping hints."""
@ -746,11 +752,52 @@ class AuthError(RuntimeError):
self.relogin_required = relogin_required
def is_rate_limited_auth_error(error: Exception) -> bool:
"""True when an :class:`AuthError` represents upstream rate-limiting / quota
exhaustion rather than missing or invalid credentials.
These failures are transient re-authenticating cannot resolve them so
callers should surface a "retry later" notice and prefer a fallback chain
instead of prompting the operator to run ``hermes auth``.
"""
return (
isinstance(error, AuthError)
and not error.relogin_required
and error.code == CODEX_RATE_LIMITED_CODE
)
def _parse_retry_after_seconds(headers: Any) -> Optional[int]:
"""Best-effort parse of a ``Retry-After`` header into whole seconds.
Supports the delta-seconds form (e.g. ``"120"``). HTTP-date forms and
missing/unparseable values return ``None`` rather than guessing.
"""
if headers is None:
return None
try:
raw = headers.get("retry-after")
except Exception:
return None
if raw is None:
return None
try:
seconds = int(str(raw).strip())
except (TypeError, ValueError):
return None
return seconds if seconds >= 0 else None
def format_auth_error(error: Exception) -> str:
"""Map auth failures to concise user-facing guidance."""
if not isinstance(error, AuthError):
return str(error)
# Rate-limit / quota errors are not credential problems — never append the
# "re-authenticate" remediation, which would mislead the operator.
if is_rate_limited_auth_error(error):
return str(error)
if error.relogin_required:
return f"{error} Run `hermes model` to re-authenticate."
@ -3308,6 +3355,30 @@ def refresh_codex_oauth_pure(
},
)
if response.status_code == 429:
# Upstream rate-limit / usage-quota exhaustion on the token endpoint.
# The stored refresh token is still valid here — re-authenticating
# cannot lift a quota cap. Classify distinctly from auth failures so
# callers surface a "retry later" notice instead of a misleading
# "run hermes auth" prompt (see issue #32790).
retry_after = _parse_retry_after_seconds(getattr(response, "headers", None))
if retry_after is not None:
message = (
f"Codex provider quota exhausted (429); retry after {retry_after}s. "
"Credentials are still valid."
)
else:
message = (
"Codex provider quota exhausted (429). Credentials are still valid; "
"retry after the usage limit resets."
)
raise AuthError(
message,
provider="openai-codex",
code=CODEX_RATE_LIMITED_CODE,
relogin_required=False,
)
if response.status_code != 200:
code = "codex_refresh_failed"
message = f"Codex token refresh failed with status {response.status_code}."

View file

@ -263,9 +263,10 @@ def test_resolve_returns_hermes_auth_store_source(tmp_path, monkeypatch):
class _StubHTTPResponse:
def __init__(self, status_code: int, payload):
def __init__(self, status_code: int, payload, headers=None):
self.status_code = status_code
self._payload = payload
self.headers = headers or {}
self.text = json.dumps(payload) if isinstance(payload, (dict, list)) else str(payload)
def json(self):
@ -382,6 +383,74 @@ def test_refresh_falls_back_to_generic_message_on_unparseable_body(monkeypatch):
assert "status 401" in str(err)
def test_refresh_429_classified_as_quota_not_auth_failure(monkeypatch):
"""429 from the token endpoint is a usage-quota cap, not an auth failure.
Regression test for #32790: must NOT force relogin and must carry the
dedicated rate-limit code so callers surface a "retry later" notice rather
than a misleading "run hermes auth".
"""
from hermes_cli.auth import (
CODEX_RATE_LIMITED_CODE,
format_auth_error,
is_rate_limited_auth_error,
)
response = _StubHTTPResponse(
429,
{"error": {"message": "You hit your usage limit.", "code": "usage_limit_reached"}},
headers={"retry-after": "120"},
)
_patch_httpx(monkeypatch, response)
with pytest.raises(AuthError) as exc_info:
refresh_codex_oauth_pure("a-tok", "r-tok")
err = exc_info.value
assert err.code == CODEX_RATE_LIMITED_CODE
assert err.relogin_required is False
assert is_rate_limited_auth_error(err) is True
assert "retry after 120s" in str(err)
# User-facing copy must not tell the operator to re-authenticate.
rendered = format_auth_error(err)
assert "re-authenticate" not in rendered
assert "hermes auth" not in rendered
def test_refresh_429_without_retry_after_header(monkeypatch):
"""429 without a Retry-After header still classifies as quota, no relogin."""
from hermes_cli.auth import CODEX_RATE_LIMITED_CODE
response = _StubHTTPResponse(429, {"error": "rate_limited"})
_patch_httpx(monkeypatch, response)
with pytest.raises(AuthError) as exc_info:
refresh_codex_oauth_pure("a-tok", "r-tok")
err = exc_info.value
assert err.code == CODEX_RATE_LIMITED_CODE
assert err.relogin_required is False
assert "quota exhausted" in str(err).lower()
def test_is_rate_limited_auth_error_distinguishes_credential_errors():
"""Missing/expired credentials must NOT be treated as rate-limit errors."""
from hermes_cli.auth import CODEX_RATE_LIMITED_CODE, is_rate_limited_auth_error
rate_limited = AuthError(
"quota", provider="openai-codex", code=CODEX_RATE_LIMITED_CODE, relogin_required=False
)
missing_creds = AuthError(
"No Codex credentials stored.",
provider="openai-codex",
code="codex_auth_missing",
relogin_required=True,
)
assert is_rate_limited_auth_error(rate_limited) is True
assert is_rate_limited_auth_error(missing_creds) is False
assert is_rate_limited_auth_error(ValueError("nope")) is False
def test_login_openai_codex_force_new_login_skips_existing_reuse_prompt(monkeypatch):
called = {"device_login": 0}