From 6f2b2a1f34d1dbeaf67a02e7212485f0f77ed116 Mon Sep 17 00:00:00 2001
From: benbenwyb <benbenwyb@gmail.com>
Date: Tue, 16 Jun 2026 19:33:21 +0800
Subject: [PATCH] fix: handle named custom providers and Z.AI overload retries

---
 agent/agent_init.py                           | 14 ++-
 agent/conversation_loop.py                    | 28 +++++-
 agent/retry_utils.py                          | 72 ++++++++++++++
 .../agent/test_custom_provider_extra_body.py  | 31 ++++++
 tests/test_retry_utils.py                     | 97 ++++++++++++++++++-
 5 files changed, 237 insertions(+), 5 deletions(-)

diff --git a/agent/agent_init.py b/agent/agent_init.py
index 065299be05a..4436daa7842 100644
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@@ -106,7 +106,12 @@ def _custom_provider_extra_body_for_agent(
     base_url: str,
     custom_providers: List[Dict[str, Any]],
 ) -> Optional[Dict[str, Any]]:
-    if (provider or "").strip().lower() != "custom":
+    provider_norm = (provider or "").strip().lower()
+    if provider_norm == "custom":
+        provider_key_filter = ""
+    elif provider_norm.startswith("custom:"):
+        provider_key_filter = provider_norm.split(":", 1)[1].strip()
+    else:
         return None
 
     target_url = _normalized_custom_base_url(base_url)
@@ -117,6 +122,13 @@ def _custom_provider_extra_body_for_agent(
     for entry in custom_providers or []:
         if not isinstance(entry, dict):
             continue
+        if provider_key_filter:
+            entry_keys = {
+                str(entry.get("provider_key", "") or "").strip().lower(),
+                str(entry.get("name", "") or "").strip().lower(),
+            }
+            if provider_key_filter not in entry_keys:
+                continue
         if _normalized_custom_base_url(entry.get("base_url")) != target_url:
             continue
         extra_body = entry.get("extra_body")
diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index b647e39c980..3526029ddab 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -55,7 +55,7 @@ from agent.model_metadata import (
 )
 from agent.process_bootstrap import _install_safe_stdio
 from agent.prompt_caching import apply_anthropic_cache_control
-from agent.retry_utils import jittered_backoff
+from agent.retry_utils import adaptive_rate_limit_backoff, jittered_backoff
 from agent.trajectory import has_incomplete_scratchpad
 from agent.usage_pricing import estimate_usage_cost, normalize_usage
 from hermes_constants import PARTIAL_STREAM_STUB_ID
@@ -3537,16 +3537,38 @@ def run_conversation(
                             except (TypeError, ValueError):
                                 pass
                 wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
+                _backoff_policy = None
+                if is_rate_limited and not _retry_after:
+                    wait_time, _backoff_policy = adaptive_rate_limit_backoff(
+                        retry_count,
+                        base_url=str(_base),
+                        model=_model,
+                        error=api_error,
+                        default_wait=wait_time,
+                    )
                 if is_rate_limited:
-                    agent._buffer_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...")
+                    _policy_note = ""
+                    if _backoff_policy == "zai_coding_overload_long":
+                        _policy_note = " (Z.AI Coding overload adaptive long backoff)"
+                    elif _backoff_policy == "zai_coding_overload_short":
+                        _policy_note = " (Z.AI Coding overload short retry)"
+                    _rate_limit_status = f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries}){_policy_note}..."
+                    # Normal retries are buffered to avoid noisy transient chatter. Long
+                    # Z.AI Coding waits are different: they can last minutes, so surface
+                    # progress immediately instead of making the TUI look frozen.
+                    if _backoff_policy == "zai_coding_overload_long":
+                        agent._emit_status(_rate_limit_status)
+                    else:
+                        agent._buffer_status(_rate_limit_status)
                 else:
                     agent._buffer_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...")
                 logger.warning(
-                    "Retrying API call in %ss (attempt %s/%s) %s error=%s",
+                    "Retrying API call in %ss (attempt %s/%s) %s policy=%s error=%s",
                     wait_time,
                     retry_count,
                     max_retries,
                     agent._client_log_context(),
+                    _backoff_policy or "default",
                     api_error,
                 )
                 # Sleep in small increments so we can respond to interrupts quickly
diff --git a/agent/retry_utils.py b/agent/retry_utils.py
index 71d6963f7b4..2922156847b 100644
--- a/agent/retry_utils.py
+++ b/agent/retry_utils.py
@@ -8,6 +8,7 @@ rate-limited provider concurrently.
 import random
 import threading
 import time
+from typing import Any
 
 # Monotonic counter for jitter seed uniqueness within the same process.
 # Protected by a lock to avoid race conditions in concurrent retry paths
@@ -15,6 +16,14 @@ import time
 _jitter_counter = 0
 _jitter_lock = threading.Lock()
 
+# Z.AI Coding Plan's GLM-5.2 endpoint often returns HTTP 429 code 1305
+# ("The service may be temporarily overloaded...") for otherwise valid
+# Hermes requests, and short retries tend to hammer the same overloaded
+# window. These are the base waits in seconds used once the short retries
+# are spent; the last entry is the cap, so an interactive TUI request fails
+# visibly within minutes instead of sitting silent for 20+ minutes.
+_ZAI_CODING_OVERLOAD_LONG_BACKOFF = (30.0, 60.0, 90.0, 120.0)
+
 
 def jittered_backoff(
     attempt: int,
@@ -55,3 +64,66 @@ def jittered_backoff(
     jitter = rng.uniform(0, jitter_ratio * delay)
 
     return delay + jitter
+
+
+def _error_text(error: Any) -> str:
+    """Flatten an error and its common payload attributes into lowercase text.
+
+    Joins ``str()`` of the error plus its ``message``, ``body`` and ``response``
+    attributes (each skipped when missing or ``None``) with single spaces.
+    """
+    attrs = [getattr(error, name, None) for name in ("message", "body", "response")]
+    present = [str(item) for item in (error, *attrs) if item is not None]
+    return " ".join(present).lower()
+
+
+def is_zai_coding_overload_error(*, base_url: str | None, model: str | None, error: Any) -> bool:
+    """Return True only for Z.AI Coding Plan GLM-5.2 transient overload 429s.
+
+    Requires HTTP 429, the coding-plan base URL, a GLM-5.2 model, and either
+    code 1305 or the "temporarily overloaded" message in the flattened error
+    text. The code must stand alone: digits embedded in a longer number (a
+    request id, a timestamp) do not count, so quota/billing 429s fail fast.
+    """
+    import re  # local import: the module-level import block is left untouched
+
+    if getattr(error, "status_code", None) != 429:
+        return False
+    if "api.z.ai/api/coding/paas/v4" not in (base_url or "").lower():
+        return False
+    if "glm-5.2" not in (model or "").lower():
+        return False
+    text = _error_text(error)
+    return "temporarily overloaded" in text or re.search(r"(?<!\d)1305(?!\d)", text) is not None
+
+
+def adaptive_rate_limit_backoff(
+    attempt: int,
+    *,
+    base_url: str | None,
+    model: str | None,
+    error: Any,
+    default_wait: float,
+    short_attempts: int = 3,
+) -> tuple[float, str | None]:
+    """Choose a rate-limit wait, applying the Z.AI Coding overload policy if it fits.
+
+    Errors that ``is_zai_coding_overload_error`` rejects get ``default_wait``
+    back with a ``None`` label. Matching errors keep ``default_wait`` for the
+    first ``short_attempts`` attempts (label ``"zai_coding_overload_short"``);
+    after that the base delay steps through
+    ``_ZAI_CODING_OVERLOAD_LONG_BACKOFF``, holding at its last entry, and
+    ``jittered_backoff`` adds jitter (label ``"zai_coding_overload_long"``).
+
+    ``attempt`` is 1-based. Returns ``(wait_seconds, policy_label)``.
+    """
+    overloaded = is_zai_coding_overload_error(base_url=base_url, model=model, error=error)
+    if not overloaded:
+        return default_wait, None
+    if attempt <= short_attempts:
+        return default_wait, "zai_coding_overload_short"
+    schedule = _ZAI_CODING_OVERLOAD_LONG_BACKOFF
+    step = min(attempt - short_attempts - 1, len(schedule) - 1)  # hold at the last entry
+    step_delay = schedule[step]
+    # A small jitter ratio keeps long waits readable while still spreading out concurrent retries.
+    return jittered_backoff(1, base_delay=step_delay, max_delay=step_delay, jitter_ratio=0.2), "zai_coding_overload_long"
diff --git a/tests/agent/test_custom_provider_extra_body.py b/tests/agent/test_custom_provider_extra_body.py
index 23556ae62de..a3a1015557a 100644
--- a/tests/agent/test_custom_provider_extra_body.py
+++ b/tests/agent/test_custom_provider_extra_body.py
@@ -91,3 +91,34 @@ def test_custom_provider_extra_body_ignores_other_custom_models():
     )
 
     assert agent.request_overrides == {}
+
+
+def test_named_custom_provider_extra_body_matches_provider_key():
+    """``custom:<key>`` must select the entry whose ``provider_key`` matches."""
+    coding_url = "https://api.z.ai/api/coding/paas/v4"
+    agent = SimpleNamespace(
+        provider="custom:zai-coding-plan",
+        model="glm-5.2",
+        base_url=coding_url,
+        request_overrides={},
+    )
+    # Listed first with the same URL and model; only its key and name differ.
+    decoy = {
+        "provider_key": "other-provider",
+        "name": "Other Provider",
+        "base_url": coding_url,
+        "model": "glm-5.2",
+        "extra_body": {"enable_thinking": True},
+    }
+    # Trailing slash on the URL: the expected result requires it to still match.
+    wanted = {
+        "provider_key": "zai-coding-plan",
+        "name": "Z.AI Coding Plan",
+        "base_url": coding_url + "/",
+        "model": "glm-5.2",
+        "extra_body": {"enable_thinking": False},
+    }
+
+    _merge_custom_provider_extra_body(agent, [decoy, wanted])
+
+    assert agent.request_overrides == {"extra_body": {"enable_thinking": False}}
diff --git a/tests/test_retry_utils.py b/tests/test_retry_utils.py
index f39c3142d9f..ff08d3a4062 100644
--- a/tests/test_retry_utils.py
+++ b/tests/test_retry_utils.py
@@ -3,7 +3,9 @@
 import threading
 
 import agent.retry_utils as retry_utils
-from agent.retry_utils import jittered_backoff
+from types import SimpleNamespace
+
+from agent.retry_utils import adaptive_rate_limit_backoff, is_zai_coding_overload_error, jittered_backoff
 
 
 def test_backoff_is_exponential():
@@ -115,3 +117,96 @@ def test_backoff_uses_locked_tick_for_seed(monkeypatch):
 
     assert len(recorded_seeds) == 2
     assert len(set(recorded_seeds)) == 2, f"Expected unique seeds, got {recorded_seeds}"
+
+
+def _zai_overload_error():
+    """Build a stand-in for the Z.AI overload error: status 429, body code 1305."""
+    overload_detail = {
+        "code": "1305",
+        "message": "The service may be temporarily overloaded, please try again later",
+    }
+    fake_error = SimpleNamespace()
+    fake_error.status_code = 429
+    fake_error.body = {"error": overload_detail}
+    return fake_error
+
+
+def test_zai_coding_overload_classifier_is_narrow():
+    """Check that the classifier fires only for the exact overload shape.
+
+    Each negative case changes exactly one of base URL, model, or error body.
+    """
+    coding_url = "https://api.z.ai/api/coding/paas/v4"
+    overload = _zai_overload_error()
+    balance_error = SimpleNamespace(
+        status_code=429,
+        body={"error": {"code": "1113", "message": "Insufficient balance"}},
+    )
+
+    def classify(base_url=coding_url, model="glm-5.2", error=overload):
+        return is_zai_coding_overload_error(base_url=base_url, model=model, error=error)
+
+    assert classify()
+
+    # Same host, but the path lacks the /coding segment.
+    assert not classify(base_url="https://api.z.ai/api/paas/v4")
+    # Coding endpoint with a different model name.
+    assert not classify(model="glm-5.1")
+    # Coding endpoint and model, but a 429 carrying a different code and message.
+    assert not classify(error=balance_error)
+
+
+def test_zai_coding_overload_backoff_keeps_first_retries_short(monkeypatch):
+    """Early attempts return ``default_wait`` under the short policy label.
+
+    Attempt 1 and attempt 3 (the default ``short_attempts`` value) are
+    checked; each must echo its own ``default_wait``.
+    """
+    # Replace jittered_backoff with a stub that returns its base_delay keyword
+    # unchanged, removing randomness from the module under test.
+    monkeypatch.setattr(retry_utils, "jittered_backoff", lambda *a, **kw: kw["base_delay"])
+
+    zai_call = {
+        "base_url": "https://api.z.ai/api/coding/paas/v4",
+        "model": "glm-5.2",
+        "error": _zai_overload_error(),
+    }
+
+    wait, policy = adaptive_rate_limit_backoff(1, default_wait=2.5, **zai_call)
+    assert wait == 2.5
+    assert policy == "zai_coding_overload_short"
+
+    wait, policy = adaptive_rate_limit_backoff(3, default_wait=9.0, **zai_call)
+    assert wait == 9.0
+    assert policy == "zai_coding_overload_short"
+
+
+def test_zai_coding_overload_backoff_grows_after_short_retries(monkeypatch):
+    """Attempts 4 through 9 step through the long schedule and hold at 120s."""
+    monkeypatch.setattr(retry_utils, "jittered_backoff", lambda *a, **kw: kw["base_delay"])
+    overload = _zai_overload_error()
+
+    results = [
+        adaptive_rate_limit_backoff(
+            attempt,
+            base_url="https://api.z.ai/api/coding/paas/v4",
+            model="glm-5.2",
+            error=overload,
+            default_wait=10.0,
+        )
+        for attempt in range(4, 10)
+    ]
+    assert all(policy == "zai_coding_overload_long" for _, policy in results)
+    assert [wait for wait, _ in results] == [30.0, 60.0, 90.0, 120.0, 120.0, 120.0]
+
+
+def test_non_zai_backoff_returns_default_wait():
+    """A non-Z.AI base URL yields ``default_wait`` unchanged and no policy label."""
+    overload = _zai_overload_error()
+
+    wait, policy = adaptive_rate_limit_backoff(
+        10, base_url="https://openrouter.ai/api/v1", model="glm-5.2", error=overload, default_wait=12.0
+    )
+
+    assert policy is None
+    assert wait == 12.0