mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(agent): classify 429 'overloaded' bodies as overloaded, not rate_limit
Z.AI / Zhipu reuse HTTP 429 for server-wide overload. The 429 status path classified these unconditionally as rate_limit with should_rotate_credential=True, so an overloaded provider exhausted the credential pool after two errors — fatal for a single-key user, who has nothing to rotate to. The credential is valid; the server is just busy. Disambiguate the 429 body against a shared _OVERLOADED_PATTERNS list and route overload language to FailoverReason.overloaded (retryable, no rotation), matching the existing 503/529 path and the message-only path (#52890). Genuine rate limits (no overload language) still rotate. The inline tuple of overload phrases that #52890 introduced in the message path is extracted into the shared _OVERLOADED_PATTERNS constant, so the status-code and message paths use one list. Closes #14038.
This commit is contained in:
parent
16192103f4
commit
38e7bd8a08
2 changed files with 63 additions and 5 deletions
|
|
@ -133,6 +133,31 @@ _RATE_LIMIT_PATTERNS = [
|
|||
"servicequotaexceededexception",
|
||||
]
|
||||
|
||||
# Patterns that indicate provider-side overload, NOT a per-credential rate
|
||||
# limit or billing problem. The credential is valid — the server is just
|
||||
# busy — so the correct recovery is "back off and retry the same key", never
|
||||
# "rotate the credential" (rotating exhausts the pool while the endpoint is
|
||||
# still busy; a single-key user has nothing to rotate to). Some providers
|
||||
# (notably Z.AI / Zhipu) reuse HTTP 429 for server-wide overload, so the 429
|
||||
# status path matches the body against this list before falling through to
|
||||
# the rate_limit default. Phrases are kept narrow and overload-flavoured so a
|
||||
# normal rate-limit message ("you have been rate-limited") doesn't hit this
|
||||
# bucket. (#14038, #15297)
|
||||
_OVERLOADED_PATTERNS = [
|
||||
"overloaded",
|
||||
"temporarily overloaded",
|
||||
"service is temporarily overloaded",
|
||||
"service may be temporarily overloaded",
|
||||
"server is overloaded",
|
||||
"server overloaded",
|
||||
"service overloaded",
|
||||
"service is overloaded",
|
||||
"upstream overloaded",
|
||||
"currently overloaded",
|
||||
"at capacity",
|
||||
"over capacity",
|
||||
]
|
||||
|
||||
# Usage-limit patterns that need disambiguation (could be billing OR rate_limit)
|
||||
_USAGE_LIMIT_PATTERNS = [
|
||||
"usage limit",
|
||||
|
|
@ -863,7 +888,19 @@ def _classify_by_status(
|
|||
)
|
||||
|
||||
if status_code == 429:
|
||||
# Already checked long_context_tier above; this is a normal rate limit
|
||||
# Already checked long_context_tier above. Some providers (notably
|
||||
# Z.AI / Zhipu) reuse HTTP 429 for server-wide overload — same status
|
||||
# code as a true per-credential rate limit, but the credential is
|
||||
# valid and the correct recovery is "back off and retry the same key",
|
||||
# NOT "rotate the credential" (which exhausts the pool while the
|
||||
# endpoint is still busy, and does nothing for a single-key user).
|
||||
# Disambiguate on the error body so an overload 429 takes the
|
||||
# transient-overload path instead of burning the pool. (#14038)
|
||||
if any(p in error_msg for p in _OVERLOADED_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.overloaded,
|
||||
retryable=True,
|
||||
)
|
||||
return result_fn(
|
||||
FailoverReason.rate_limit,
|
||||
retryable=True,
|
||||
|
|
@ -1219,10 +1256,7 @@ def _classify_by_message(
|
|||
# e.g. some Anthropic-compatible proxies) classifies as a transient
|
||||
# overload (backoff + retry) instead of falling through to `unknown` or
|
||||
# incorrectly triggering credential rotation.
|
||||
if any(p in error_msg for p in (
|
||||
"overloaded", "temporarily overloaded",
|
||||
"service is temporarily overloaded",
|
||||
)):
|
||||
if any(p in error_msg for p in _OVERLOADED_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.overloaded,
|
||||
retryable=True,
|
||||
|
|
|
|||
|
|
@ -386,6 +386,30 @@ class TestClassifyApiError:
|
|||
assert result.retryable is True
|
||||
assert result.should_rotate_credential is False
|
||||
|
||||
def test_429_with_overloaded_body_is_overloaded_not_rate_limit(self):
|
||||
"""Z.AI / Zhipu reuse HTTP 429 for server-wide overload. The credential
|
||||
is valid — the server is just busy — so it must classify as overloaded
|
||||
(back off + retry the same key), NOT rate_limit (which would rotate and
|
||||
exhaust the pool, doing nothing for a single-key user). (#14038)"""
|
||||
e = MockAPIError(
|
||||
"The service may be temporarily overloaded, please try again later",
|
||||
status_code=429,
|
||||
)
|
||||
result = classify_api_error(e, provider="zai")
|
||||
assert result.reason == FailoverReason.overloaded
|
||||
assert result.retryable is True
|
||||
assert result.should_rotate_credential is False
|
||||
|
||||
def test_429_normal_rate_limit_still_rotates(self):
|
||||
"""Guard: a genuine 429 rate limit (no overload language) must still
|
||||
classify as rate_limit and rotate the credential. (#14038)"""
|
||||
e = MockAPIError(
|
||||
"Rate limit exceeded: too many requests", status_code=429
|
||||
)
|
||||
result = classify_api_error(e, provider="zai")
|
||||
assert result.reason == FailoverReason.rate_limit
|
||||
assert result.should_rotate_credential is True
|
||||
|
||||
# ── 5xx that are actually request-validation errors ──
|
||||
# Some OpenAI-compatible gateways (e.g. codex.nekos.me) return
|
||||
# request-validation failures with a 5xx status. These are
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue