mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Merge b0d5d63d9d into 3e61703b08
This commit is contained in:
commit
00f6226155
4 changed files with 268 additions and 4 deletions
|
|
@ -147,6 +147,29 @@ _PAYLOAD_TOO_LARGE_PATTERNS = [
|
|||
"error code: 413",
|
||||
]
|
||||
|
||||
# Patterns that indicate provider-side overload, NOT per-credential rate
|
||||
# limiting. Some providers (e.g. Z.AI / Zhipu) return HTTP 429 for
|
||||
# server-wide overload — same status code as a true rate limit but the
|
||||
# correct recovery is "rotate immediately, the whole endpoint is down"
|
||||
# rather than "back off and retry the same key". Match against the
|
||||
# message body alongside the 429 status code in the classifier. Keep
|
||||
# phrases narrow and provider-language-flavoured so a normal rate-limit
|
||||
# message ("you have been rate-limited") doesn't accidentally hit this
|
||||
# bucket — only true overload language does (#15297).
|
||||
_OVERLOADED_PATTERNS = [
|
||||
"temporarily overloaded",
|
||||
"server is overloaded",
|
||||
"server overloaded",
|
||||
"service overloaded",
|
||||
"service is overloaded",
|
||||
"service may be temporarily overloaded",
|
||||
"upstream overloaded",
|
||||
"is overloaded, please try again",
|
||||
"at capacity",
|
||||
"over capacity",
|
||||
"currently overloaded",
|
||||
]
|
||||
|
||||
# Context overflow patterns
|
||||
_CONTEXT_OVERFLOW_PATTERNS = [
|
||||
"context length",
|
||||
|
|
@ -589,7 +612,20 @@ def _classify_by_status(
|
|||
)
|
||||
|
||||
if status_code == 429:
|
||||
# Already checked long_context_tier above; this is a normal rate limit
|
||||
# Already checked long_context_tier above. Some providers (notably
|
||||
# Z.AI / Zhipu) reuse 429 for server-wide overload — correct recovery
|
||||
# is "skip retry, rotate immediately" rather than "back off, then
|
||||
# rotate" (which is the right answer for a per-credential rate
|
||||
# limit). Disambiguate on the error body so the retry loop picks
|
||||
# the matching strategy via the corresponding ``FailoverReason``
|
||||
# handler (#15297).
|
||||
if any(p in error_msg for p in _OVERLOADED_PATTERNS):
|
||||
return result_fn(
|
||||
FailoverReason.overloaded,
|
||||
retryable=True,
|
||||
should_rotate_credential=True,
|
||||
should_fallback=True,
|
||||
)
|
||||
return result_fn(
|
||||
FailoverReason.rate_limit,
|
||||
retryable=True,
|
||||
|
|
|
|||
24
run_agent.py
24
run_agent.py
|
|
@ -5565,7 +5565,7 @@ class AIAgent:
|
|||
next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
|
||||
if next_entry is not None:
|
||||
logger.info(
|
||||
"Credential %s (billing) — rotated to pool entry %s",
|
||||
"HTTP %s (billing) — rotated credential to pool entry %s",
|
||||
rotate_status,
|
||||
getattr(next_entry, "id", "?"),
|
||||
)
|
||||
|
|
@ -5580,7 +5580,7 @@ class AIAgent:
|
|||
next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
|
||||
if next_entry is not None:
|
||||
logger.info(
|
||||
"Credential %s (rate limit) — rotated to pool entry %s",
|
||||
"HTTP %s (rate limit) — rotated credential to pool entry %s",
|
||||
rotate_status,
|
||||
getattr(next_entry, "id", "?"),
|
||||
)
|
||||
|
|
@ -5588,6 +5588,24 @@ class AIAgent:
|
|||
return True, False
|
||||
return False, True
|
||||
|
||||
if effective_reason == FailoverReason.overloaded:
|
||||
# Provider-side overload (e.g. Z.AI 429 "service is temporarily
|
||||
# overloaded"): the whole endpoint is struggling, not just this
|
||||
# API key — retrying the same credential burns a slot and
|
||||
# delays recovery. Rotate immediately, same shape as
|
||||
# ``billing`` does, no retry-on-same-credential first (#15297).
|
||||
rotate_status = status_code if status_code is not None else 429
|
||||
next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
|
||||
if next_entry is not None:
|
||||
logger.info(
|
||||
"HTTP %s (provider overloaded) — rotated credential to pool entry %s",
|
||||
rotate_status,
|
||||
getattr(next_entry, "id", "?"),
|
||||
)
|
||||
self._swap_credential(next_entry)
|
||||
return True, False
|
||||
return False, has_retried_429
|
||||
|
||||
if effective_reason == FailoverReason.auth:
|
||||
refreshed = pool.try_refresh_current()
|
||||
if refreshed is not None:
|
||||
|
|
@ -5600,7 +5618,7 @@ class AIAgent:
|
|||
next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
|
||||
if next_entry is not None:
|
||||
logger.info(
|
||||
"Credential %s (auth refresh failed) — rotated to pool entry %s",
|
||||
"HTTP %s (auth refresh failed) — rotated credential to pool entry %s",
|
||||
rotate_status,
|
||||
getattr(next_entry, "id", "?"),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -232,3 +232,134 @@ class TestPoolRotationCycle:
|
|||
)
|
||||
assert recovered is False
|
||||
assert has_retried is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 6. FailoverReason.overloaded handler (#15297)
|
||||
#
|
||||
# When the classifier produces ``overloaded`` (e.g. Z.AI 429 "service is
|
||||
# temporarily overloaded"), the recovery path must rotate immediately —
|
||||
# the whole endpoint is down, retrying the same credential burns a slot
|
||||
# and delays recovery. Same shape as ``billing``, distinct from
|
||||
# ``rate_limit`` which retries-first.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestPoolOverloadedRotation:
|
||||
"""Verify ``FailoverReason.overloaded`` rotates immediately, no retry-first."""
|
||||
|
||||
def _make_agent_with_pool(self, pool_entries=3):
|
||||
"""Same factory shape as TestPoolRotationCycle — keep them parallel
|
||||
so a future refactor of the agent's pool seam updates both
|
||||
classes uniformly."""
|
||||
from run_agent import AIAgent
|
||||
|
||||
with patch.object(AIAgent, "__init__", lambda self, **kw: None):
|
||||
agent = AIAgent()
|
||||
|
||||
entries = []
|
||||
for i in range(pool_entries):
|
||||
e = MagicMock(name=f"entry_{i}")
|
||||
e.id = f"cred-{i}"
|
||||
entries.append(e)
|
||||
|
||||
pool = MagicMock()
|
||||
pool.has_credentials.return_value = True
|
||||
|
||||
self._rotation_index = 0
|
||||
|
||||
def rotate(status_code=None, error_context=None):
|
||||
self._rotation_index += 1
|
||||
if self._rotation_index < pool_entries:
|
||||
return entries[self._rotation_index]
|
||||
pool.has_credentials.return_value = False
|
||||
return None
|
||||
|
||||
pool.mark_exhausted_and_rotate = MagicMock(side_effect=rotate)
|
||||
agent._credential_pool = pool
|
||||
agent._swap_credential = MagicMock()
|
||||
agent.log_prefix = ""
|
||||
|
||||
return agent, pool, entries
|
||||
|
||||
def test_overloaded_rotates_immediately_on_first_failure(self):
|
||||
"""The #15297 fix: ``overloaded`` must NOT use the rate-limit
|
||||
retry-first path. First failure rotates straight away because
|
||||
retrying the same credential against a saturated provider just
|
||||
burns the next ``has_retried_429`` slot too."""
|
||||
from agent.error_classifier import FailoverReason
|
||||
|
||||
agent, pool, entries = self._make_agent_with_pool(3)
|
||||
recovered, has_retried = agent._recover_with_credential_pool(
|
||||
status_code=429,
|
||||
has_retried_429=False, # first failure
|
||||
classified_reason=FailoverReason.overloaded,
|
||||
)
|
||||
assert recovered is True, (
|
||||
"overloaded must rotate on first failure, like billing — "
|
||||
"not wait for has_retried_429 like rate_limit does"
|
||||
)
|
||||
assert has_retried is False
|
||||
pool.mark_exhausted_and_rotate.assert_called_once_with(
|
||||
status_code=429, error_context=None,
|
||||
)
|
||||
agent._swap_credential.assert_called_once_with(entries[1])
|
||||
|
||||
def test_overloaded_does_not_block_on_has_retried_flag(self):
|
||||
"""The retry-first gate is specific to rate_limit. An
|
||||
``overloaded`` reason must rotate regardless of whether the
|
||||
retry flag is currently set or unset — same behaviour both
|
||||
ways, no state leak from a previous rate-limit cycle."""
|
||||
from agent.error_classifier import FailoverReason
|
||||
|
||||
agent, pool, entries = self._make_agent_with_pool(3)
|
||||
recovered, has_retried = agent._recover_with_credential_pool(
|
||||
status_code=429,
|
||||
has_retried_429=True, # already retried — should still rotate
|
||||
classified_reason=FailoverReason.overloaded,
|
||||
)
|
||||
assert recovered is True
|
||||
assert has_retried is False # reset after rotation
|
||||
pool.mark_exhausted_and_rotate.assert_called_once()
|
||||
|
||||
def test_overloaded_pool_exhaustion_returns_false(self):
|
||||
"""When the pool only has one credential and it overloads, the
|
||||
rotation step returns no next entry — the recovery call must
|
||||
report ``recovered=False`` so the caller falls through to its
|
||||
fallback chain rather than spinning."""
|
||||
from agent.error_classifier import FailoverReason
|
||||
|
||||
agent, _pool, _ = self._make_agent_with_pool(1)
|
||||
recovered, has_retried = agent._recover_with_credential_pool(
|
||||
status_code=429,
|
||||
has_retried_429=False,
|
||||
classified_reason=FailoverReason.overloaded,
|
||||
)
|
||||
assert recovered is False, (
|
||||
"single-entry pool that overloads must return recovered=False "
|
||||
"so the outer fallback chain takes over"
|
||||
)
|
||||
# has_retried_429 is propagated unchanged in the no-rotation
|
||||
# branch — pinned so a future tweak to the contract surfaces
|
||||
# via this test.
|
||||
assert has_retried is False
|
||||
|
||||
def test_rate_limit_still_uses_retry_first(self):
|
||||
"""Negative control: ``rate_limit`` must KEEP its retry-first
|
||||
semantics. A regression that broadened the overloaded handler
|
||||
to also catch rate_limit would silently double-rotate every
|
||||
per-key throttle and burn the pool fast."""
|
||||
from agent.error_classifier import FailoverReason
|
||||
|
||||
agent, pool, _ = self._make_agent_with_pool(3)
|
||||
recovered, has_retried = agent._recover_with_credential_pool(
|
||||
status_code=429,
|
||||
has_retried_429=False,
|
||||
classified_reason=FailoverReason.rate_limit,
|
||||
)
|
||||
assert recovered is False, (
|
||||
"rate_limit on first failure must NOT rotate — the retry-"
|
||||
"first path is the contract"
|
||||
)
|
||||
assert has_retried is True
|
||||
pool.mark_exhausted_and_rotate.assert_not_called()
|
||||
|
|
|
|||
|
|
@ -250,6 +250,85 @@ class TestClassifyApiError:
|
|||
assert result.reason == FailoverReason.rate_limit
|
||||
assert result.should_fallback is True
|
||||
|
||||
# ── 429 overload disambiguation (#15297) ──
|
||||
#
|
||||
# Some providers (notably Z.AI / Zhipu) reuse 429 for server-side
|
||||
# overload rather than per-credential rate limiting. Recovery for
|
||||
# these is "rotate immediately, the whole endpoint is down" — not
|
||||
# the rate-limit "back off, retry same key once, then rotate" path.
|
||||
# The classifier disambiguates on the message body so the retry loop
|
||||
# picks the right ``FailoverReason`` handler.
|
||||
|
||||
def test_429_zai_temporarily_overloaded(self):
|
||||
"""The exact message from #15297's Z.AI / Zhipu reproduction."""
|
||||
msg = (
|
||||
"HTTP 429: The service may be temporarily overloaded, "
|
||||
"please try again later"
|
||||
)
|
||||
e = MockAPIError(msg, status_code=429)
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.overloaded, (
|
||||
f"Z.AI overload message must classify as overloaded, got "
|
||||
f"{result.reason!r} — recovery would burn a same-credential "
|
||||
f"retry instead of rotating immediately"
|
||||
)
|
||||
assert result.should_rotate_credential is True
|
||||
assert result.should_fallback is True
|
||||
assert result.retryable is True
|
||||
|
||||
@pytest.mark.parametrize("phrase", [
|
||||
"Service is overloaded, please try again later",
|
||||
"server is overloaded",
|
||||
"server overloaded — retry",
|
||||
"Service overloaded",
|
||||
"Upstream overloaded",
|
||||
"API is currently overloaded",
|
||||
"is overloaded, please try again",
|
||||
"The endpoint is at capacity",
|
||||
"Provider over capacity",
|
||||
])
|
||||
def test_429_overload_phrases_route_to_overloaded(self, phrase):
|
||||
"""Phrase matrix: any provider-language overload phrase on a 429
|
||||
must route to ``overloaded`` not ``rate_limit``. Parametrised
|
||||
so a future provider quirk that reads "overloaded" but is really
|
||||
rate limiting can be surfaced and reclassified narrowly."""
|
||||
e = MockAPIError(phrase, status_code=429)
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.overloaded, (
|
||||
f"phrase {phrase!r} should map to overloaded; got {result.reason!r}"
|
||||
)
|
||||
|
||||
def test_429_plain_rate_limit_remains_rate_limit(self):
|
||||
"""Regression guard: a plain 429 with rate-limit-flavoured text
|
||||
(no overload phrase) must STILL classify as rate_limit so the
|
||||
backoff-then-rotate path keeps working for normal per-key
|
||||
throttling. Without this guard, the disambiguation could
|
||||
silently broaden and break every 429 recovery."""
|
||||
e = MockAPIError(
|
||||
"Rate limit reached for requests per minute", status_code=429,
|
||||
)
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.rate_limit, (
|
||||
"vanilla rate-limit message must NOT route to overloaded — "
|
||||
"the recovery strategies are different (retry-then-rotate vs "
|
||||
"rotate-immediately)"
|
||||
)
|
||||
|
||||
def test_429_too_many_requests_remains_rate_limit(self):
|
||||
"""The literal HTTP 429 reason phrase ('Too Many Requests') must
|
||||
route to rate_limit too — it carries no overload language."""
|
||||
e = MockAPIError("Too Many Requests", status_code=429)
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.rate_limit
|
||||
|
||||
def test_503_overloaded_path_unchanged(self):
|
||||
"""Sanity check: HTTP 503 already mapped to ``overloaded`` and
|
||||
must keep doing so. Adding the 429 overload branch must not
|
||||
accidentally regress the 503 path."""
|
||||
e = MockAPIError("Service Unavailable", status_code=503)
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.overloaded
|
||||
|
||||
def test_alibaba_rate_increased_too_quickly(self):
|
||||
"""Alibaba/DashScope returns a unique throttling message.
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue