From 6f2b2a1f34d1dbeaf67a02e7212485f0f77ed116 Mon Sep 17 00:00:00 2001
From: benbenwyb
Date: Tue, 16 Jun 2026 19:33:21 +0800
Subject: [PATCH] fix: handle named custom providers and Z.AI overload retries

- agent_init: apply a custom provider's extra_body for "custom:NAME"
  providers too, selecting the entry whose provider_key or name equals NAME
  (plain "custom" keeps matching on base_url only).
- retry_utils: add is_zai_coding_overload_error() and
  adaptive_rate_limit_backoff(). Z.AI Coding Plan GLM-5.2 overload 429s
  (body code 1305) keep the first three retries on the default wait, then
  back off 30s/60s/90s/120s (capped) with light jitter. Code 1305 is matched
  only as a standalone number so digit runs that merely contain "1305" do
  not trigger the long backoff.
- conversation_loop: use the adaptive backoff when no Retry-After value is
  available, emit long Z.AI waits immediately instead of buffering them, and
  log the selected backoff policy.
---
 agent/agent_init.py                           | 14 ++-
 agent/conversation_loop.py                    | 28 +++++-
 agent/retry_utils.py                          | 75 ++++++++++++++
 .../agent/test_custom_provider_extra_body.py  | 31 ++++++
 tests/test_retry_utils.py                     | 97 ++++++++++++++++++-
 5 files changed, 240 insertions(+), 5 deletions(-)

diff --git a/agent/agent_init.py b/agent/agent_init.py
index 065299be05a..4436daa7842 100644
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@@ -106,7 +106,12 @@ def _custom_provider_extra_body_for_agent(
     base_url: str,
     custom_providers: List[Dict[str, Any]],
 ) -> Optional[Dict[str, Any]]:
-    if (provider or "").strip().lower() != "custom":
+    provider_norm = (provider or "").strip().lower()
+    if provider_norm == "custom":
+        provider_key_filter = ""
+    elif provider_norm.startswith("custom:"):
+        provider_key_filter = provider_norm.split(":", 1)[1].strip()
+    else:
         return None
 
     target_url = _normalized_custom_base_url(base_url)
@@ -117,6 +122,13 @@ def _custom_provider_extra_body_for_agent(
     for entry in custom_providers or []:
         if not isinstance(entry, dict):
             continue
+        if provider_key_filter:
+            entry_keys = {
+                str(entry.get("provider_key", "") or "").strip().lower(),
+                str(entry.get("name", "") or "").strip().lower(),
+            }
+            if provider_key_filter not in entry_keys:
+                continue
         if _normalized_custom_base_url(entry.get("base_url")) != target_url:
             continue
         extra_body = entry.get("extra_body")
diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index b647e39c980..3526029ddab 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -55,7 +55,7 @@ from agent.model_metadata import (
 )
 from agent.process_bootstrap import _install_safe_stdio
 from agent.prompt_caching import apply_anthropic_cache_control
-from agent.retry_utils import jittered_backoff
+from agent.retry_utils import adaptive_rate_limit_backoff, jittered_backoff
 from agent.trajectory import has_incomplete_scratchpad
 from agent.usage_pricing import estimate_usage_cost, normalize_usage
 from hermes_constants import PARTIAL_STREAM_STUB_ID
@@ -3537,16 +3537,38 @@ def run_conversation(
                                 except (TypeError, ValueError):
                                     pass
                     wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
+                    _backoff_policy = None
+                    if is_rate_limited and not _retry_after:
+                        wait_time, _backoff_policy = adaptive_rate_limit_backoff(
+                            retry_count,
+                            base_url=str(_base),
+                            model=_model,
+                            error=api_error,
+                            default_wait=wait_time,
+                        )
                     if is_rate_limited:
-                        agent._buffer_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...")
+                        _policy_note = ""
+                        if _backoff_policy == "zai_coding_overload_long":
+                            _policy_note = " (Z.AI Coding overload adaptive long backoff)"
+                        elif _backoff_policy == "zai_coding_overload_short":
+                            _policy_note = " (Z.AI Coding overload short retry)"
+                        _rate_limit_status = f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries}){_policy_note}..."
+                        # Normal retries are buffered to avoid noisy transient chatter. Long
+                        # Z.AI Coding waits are different: they can last minutes, so surface
+                        # progress immediately instead of making the TUI look frozen.
+                        if _backoff_policy == "zai_coding_overload_long":
+                            agent._emit_status(_rate_limit_status)
+                        else:
+                            agent._buffer_status(_rate_limit_status)
                     else:
                         agent._buffer_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...")
                     logger.warning(
-                        "Retrying API call in %ss (attempt %s/%s) %s error=%s",
+                        "Retrying API call in %ss (attempt %s/%s) %s policy=%s error=%s",
                         wait_time,
                         retry_count,
                         max_retries,
                         agent._client_log_context(),
+                        _backoff_policy or "default",
                         api_error,
                     )
                     # Sleep in small increments so we can respond to interrupts quickly
diff --git a/agent/retry_utils.py b/agent/retry_utils.py
index 71d6963f7b4..2922156847b 100644
--- a/agent/retry_utils.py
+++ b/agent/retry_utils.py
@@ -8,6 +8,7 @@ rate-limited provider concurrently.
 import random
 import threading
 import time
+from typing import Any
 
 # Monotonic counter for jitter seed uniqueness within the same process.
 # Protected by a lock to avoid race conditions in concurrent retry paths
@@ -15,6 +16,14 @@ import time
 _jitter_counter = 0
 _jitter_lock = threading.Lock()
 
+# Z.AI Coding Plan's GLM-5.2 endpoint often returns HTTP 429 code 1305
+# ("The service may be temporarily overloaded...") for otherwise valid
+# Hermes requests. Short retries tend to hammer the same overloaded window;
+# after a few normal retries, progressively widen the wait window. Keep the
+# cap interactive-friendly: a simple TUI message should fail visibly in minutes,
+# not sit silent for 20+ minutes.
+_ZAI_CODING_OVERLOAD_LONG_BACKOFF = (30.0, 60.0, 90.0, 120.0)
+
 
 def jittered_backoff(
     attempt: int,
@@ -55,3 +64,69 @@ def jittered_backoff(
     jitter = rng.uniform(0, jitter_ratio * delay)
 
     return delay + jitter
+
+
+def _error_text(error: Any) -> str:
+    """Best-effort flattened provider error text for retry classification."""
+    parts = [
+        error,
+        getattr(error, "message", None),
+        getattr(error, "body", None),
+        getattr(error, "response", None),
+    ]
+    return " ".join(str(part) for part in parts if part is not None).lower()
+
+
+def is_zai_coding_overload_error(*, base_url: str | None, model: str | None, error: Any) -> bool:
+    """Return True for Z.AI Coding Plan transient overload 429s.
+
+    The coding-plan endpoint reports overload as HTTP 429 with body code 1305
+    and message "The service may be temporarily overloaded...". Treat only
+    that narrow shape specially so ordinary quota/billing 429s still fail fast
+    through the existing classifier.
+    """
+    base = (base_url or "").lower()
+    model_name = (model or "").lower()
+    status = getattr(error, "status_code", None)
+    text = _error_text(error)
+    # Match the overload code only as a standalone number: a bare substring
+    # test would also fire on request IDs or timestamps that contain "1305".
+    numeric_tokens = "".join(ch if ch.isdigit() else " " for ch in text).split()
+    return (
+        status == 429
+        and "api.z.ai/api/coding/paas/v4" in base
+        and "glm-5.2" in model_name
+        and ("1305" in numeric_tokens or "temporarily overloaded" in text)
+    )
+
+
+def adaptive_rate_limit_backoff(
+    attempt: int,
+    *,
+    base_url: str | None,
+    model: str | None,
+    error: Any,
+    default_wait: float,
+    short_attempts: int = 3,
+) -> tuple[float, str | None]:
+    """Provider-aware rate-limit backoff.
+
+    For most providers this returns ``default_wait`` unchanged. For Z.AI
+    Coding Plan GLM-5.2 overloads, keep the first ``short_attempts`` retries on
+    the normal short exponential schedule, then switch to progressively longer
+    waits (30s → 60s → 90s → 120s, capped) plus light jitter.
+
+    ``attempt`` is 1-based, matching the retry loop's logged attempt number.
+    Returns ``(wait_seconds, reason_label)`` where ``reason_label`` is suitable
+    for status/log decoration when a provider-specific policy fired.
+    """
+    if not is_zai_coding_overload_error(base_url=base_url, model=model, error=error):
+        return default_wait, None
+    if attempt <= short_attempts:
+        return default_wait, "zai_coding_overload_short"
+
+    idx = min(attempt - short_attempts - 1, len(_ZAI_CODING_OVERLOAD_LONG_BACKOFF) - 1)
+    base_delay = _ZAI_CODING_OVERLOAD_LONG_BACKOFF[idx]
+    # A smaller jitter ratio keeps long waits readable while still avoiding
+    # synchronized retry storms across concurrent Hermes sessions.
+    return jittered_backoff(1, base_delay=base_delay, max_delay=base_delay, jitter_ratio=0.2), "zai_coding_overload_long"
diff --git a/tests/agent/test_custom_provider_extra_body.py b/tests/agent/test_custom_provider_extra_body.py
index 23556ae62de..a3a1015557a 100644
--- a/tests/agent/test_custom_provider_extra_body.py
+++ b/tests/agent/test_custom_provider_extra_body.py
@@ -91,3 +91,34 @@ def test_custom_provider_extra_body_ignores_other_custom_models():
     )
 
     assert agent.request_overrides == {}
+
+
+def test_named_custom_provider_extra_body_matches_provider_key():
+    agent = SimpleNamespace(
+        provider="custom:zai-coding-plan",
+        model="glm-5.2",
+        base_url="https://api.z.ai/api/coding/paas/v4",
+        request_overrides={},
+    )
+
+    _merge_custom_provider_extra_body(
+        agent,
+        [
+            {
+                "provider_key": "other-provider",
+                "name": "Other Provider",
+                "base_url": "https://api.z.ai/api/coding/paas/v4",
+                "model": "glm-5.2",
+                "extra_body": {"enable_thinking": True},
+            },
+            {
+                "provider_key": "zai-coding-plan",
+                "name": "Z.AI Coding Plan",
+                "base_url": "https://api.z.ai/api/coding/paas/v4/",
+                "model": "glm-5.2",
+                "extra_body": {"enable_thinking": False},
+            },
+        ],
+    )
+
+    assert agent.request_overrides == {"extra_body": {"enable_thinking": False}}
diff --git a/tests/test_retry_utils.py b/tests/test_retry_utils.py
index f39c3142d9f..ff08d3a4062 100644
--- a/tests/test_retry_utils.py
+++ b/tests/test_retry_utils.py
@@ -3,7 +3,9 @@
 import threading
 
 import agent.retry_utils as retry_utils
-from agent.retry_utils import jittered_backoff
+from types import SimpleNamespace
+
+from agent.retry_utils import adaptive_rate_limit_backoff, is_zai_coding_overload_error, jittered_backoff
 
 
 def test_backoff_is_exponential():
@@ -115,3 +117,96 @@ def test_backoff_uses_locked_tick_for_seed(monkeypatch):
 
     assert len(recorded_seeds) == 2
     assert len(set(recorded_seeds)) == 2, f"Expected unique seeds, got {recorded_seeds}"
+
+
+def _zai_overload_error():
+    return SimpleNamespace(
+        status_code=429,
+        body={
+            "error": {
+                "code": "1305",
+                "message": "The service may be temporarily overloaded, please try again later",
+            }
+        },
+    )
+
+
+def test_zai_coding_overload_classifier_is_narrow():
+    err = _zai_overload_error()
+    assert is_zai_coding_overload_error(
+        base_url="https://api.z.ai/api/coding/paas/v4",
+        model="glm-5.2",
+        error=err,
+    )
+
+    assert not is_zai_coding_overload_error(
+        base_url="https://api.z.ai/api/paas/v4",
+        model="glm-5.2",
+        error=err,
+    )
+    assert not is_zai_coding_overload_error(
+        base_url="https://api.z.ai/api/coding/paas/v4",
+        model="glm-5.1",
+        error=err,
+    )
+    assert not is_zai_coding_overload_error(
+        base_url="https://api.z.ai/api/coding/paas/v4",
+        model="glm-5.2",
+        error=SimpleNamespace(status_code=429, body={"error": {"code": "1113", "message": "Insufficient balance"}}),
+    )
+
+
+def test_zai_coding_overload_backoff_keeps_first_retries_short(monkeypatch):
+    monkeypatch.setattr(retry_utils, "jittered_backoff", lambda *a, **kw: kw["base_delay"])
+    err = _zai_overload_error()
+
+    wait, policy = adaptive_rate_limit_backoff(
+        1,
+        base_url="https://api.z.ai/api/coding/paas/v4",
+        model="glm-5.2",
+        error=err,
+        default_wait=2.5,
+    )
+    assert wait == 2.5
+    assert policy == "zai_coding_overload_short"
+
+    wait, policy = adaptive_rate_limit_backoff(
+        3,
+        base_url="https://api.z.ai/api/coding/paas/v4",
+        model="glm-5.2",
+        error=err,
+        default_wait=9.0,
+    )
+    assert wait == 9.0
+    assert policy == "zai_coding_overload_short"
+
+
+def test_zai_coding_overload_backoff_grows_after_short_retries(monkeypatch):
+    monkeypatch.setattr(retry_utils, "jittered_backoff", lambda *a, **kw: kw["base_delay"])
+    err = _zai_overload_error()
+
+    waits = []
+    for attempt in range(4, 10):
+        wait, policy = adaptive_rate_limit_backoff(
+            attempt,
+            base_url="https://api.z.ai/api/coding/paas/v4",
+            model="glm-5.2",
+            error=err,
+            default_wait=10.0,
+        )
+        waits.append(wait)
+        assert policy == "zai_coding_overload_long"
+
+    assert waits == [30.0, 60.0, 90.0, 120.0, 120.0, 120.0]
+
+
+def test_non_zai_backoff_returns_default_wait():
+    wait, policy = adaptive_rate_limit_backoff(
+        10,
+        base_url="https://openrouter.ai/api/v1",
+        model="glm-5.2",
+        error=_zai_overload_error(),
+        default_wait=12.0,
+    )
+    assert wait == 12.0
+    assert policy is None