fix: handle named custom providers and Z.AI overload retries

This commit is contained in:
benbenwyb 2026-06-16 19:33:21 +08:00 committed by Teknium
parent 736e981abf
commit 6f2b2a1f34
5 changed files with 237 additions and 5 deletions

View file

@ -106,7 +106,12 @@ def _custom_provider_extra_body_for_agent(
base_url: str,
custom_providers: List[Dict[str, Any]],
) -> Optional[Dict[str, Any]]:
if (provider or "").strip().lower() != "custom":
provider_norm = (provider or "").strip().lower()
if provider_norm == "custom":
provider_key_filter = ""
elif provider_norm.startswith("custom:"):
provider_key_filter = provider_norm.split(":", 1)[1].strip()
else:
return None
target_url = _normalized_custom_base_url(base_url)
@ -117,6 +122,13 @@ def _custom_provider_extra_body_for_agent(
for entry in custom_providers or []:
if not isinstance(entry, dict):
continue
if provider_key_filter:
entry_keys = {
str(entry.get("provider_key", "") or "").strip().lower(),
str(entry.get("name", "") or "").strip().lower(),
}
if provider_key_filter not in entry_keys:
continue
if _normalized_custom_base_url(entry.get("base_url")) != target_url:
continue
extra_body = entry.get("extra_body")

View file

@ -55,7 +55,7 @@ from agent.model_metadata import (
)
from agent.process_bootstrap import _install_safe_stdio
from agent.prompt_caching import apply_anthropic_cache_control
from agent.retry_utils import jittered_backoff
from agent.retry_utils import adaptive_rate_limit_backoff, jittered_backoff
from agent.trajectory import has_incomplete_scratchpad
from agent.usage_pricing import estimate_usage_cost, normalize_usage
from hermes_constants import PARTIAL_STREAM_STUB_ID
@ -3537,16 +3537,38 @@ def run_conversation(
except (TypeError, ValueError):
pass
wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
_backoff_policy = None
if is_rate_limited and not _retry_after:
wait_time, _backoff_policy = adaptive_rate_limit_backoff(
retry_count,
base_url=str(_base),
model=_model,
error=api_error,
default_wait=wait_time,
)
if is_rate_limited:
agent._buffer_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...")
_policy_note = ""
if _backoff_policy == "zai_coding_overload_long":
_policy_note = " (Z.AI Coding overload adaptive long backoff)"
elif _backoff_policy == "zai_coding_overload_short":
_policy_note = " (Z.AI Coding overload short retry)"
_rate_limit_status = f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries}){_policy_note}..."
# Normal retries are buffered to avoid noisy transient chatter. Long
# Z.AI Coding waits are different: they can last minutes, so surface
# progress immediately instead of making the TUI look frozen.
if _backoff_policy == "zai_coding_overload_long":
agent._emit_status(_rate_limit_status)
else:
agent._buffer_status(_rate_limit_status)
else:
agent._buffer_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...")
logger.warning(
"Retrying API call in %ss (attempt %s/%s) %s error=%s",
"Retrying API call in %ss (attempt %s/%s) %s policy=%s error=%s",
wait_time,
retry_count,
max_retries,
agent._client_log_context(),
_backoff_policy or "default",
api_error,
)
# Sleep in small increments so we can respond to interrupts quickly

View file

@ -8,6 +8,7 @@ rate-limited provider concurrently.
import random
import threading
import time
from typing import Any
# Monotonic counter for jitter seed uniqueness within the same process.
# Protected by a lock to avoid race conditions in concurrent retry paths
@ -15,6 +16,14 @@ import time
_jitter_counter = 0
_jitter_lock = threading.Lock()
# Z.AI Coding Plan's GLM-5.2 endpoint often returns HTTP 429 code 1305
# ("The service may be temporarily overloaded...") for otherwise valid
# Hermes requests. Short retries tend to hammer the same overloaded window,
# so after a few normal retries the wait is progressively widened using the
# steps below (base delays in seconds; the last step is reused for every
# later attempt). Keep the cap interactive-friendly: a simple TUI message
# should fail visibly in minutes, not sit silent for 20+ minutes.
_ZAI_CODING_OVERLOAD_LONG_BACKOFF = (30.0, 60.0, 90.0, 120.0)
def jittered_backoff(
attempt: int,
@ -55,3 +64,66 @@ def jittered_backoff(
jitter = rng.uniform(0, jitter_ratio * delay)
return delay + jitter
def _error_text(error: Any) -> str:
"""Best-effort flattened provider error text for retry classification."""
parts = [
error,
getattr(error, "message", None),
getattr(error, "body", None),
getattr(error, "response", None),
]
return " ".join(str(part) for part in parts if part is not None).lower()
def is_zai_coding_overload_error(*, base_url: str | None, model: str | None, error: Any) -> bool:
    """Return True for Z.AI Coding Plan transient overload 429s.

    The coding-plan endpoint reports overload as HTTP 429 with body code 1305
    and message "The service may be temporarily overloaded...". Treat only
    that narrow shape specially so ordinary quota/billing 429s still fail fast
    through the existing classifier.

    Args:
        base_url: Endpoint URL of the request; must contain the coding-plan
            path ``api.z.ai/api/coding/paas/v4`` (case-insensitive).
        model: Model name; must contain ``glm-5.2`` (case-insensitive).
        error: The raised provider error. Its ``status_code`` attribute must
            equal 429, and its flattened text must carry either the code
            ``1305`` or the phrase "temporarily overloaded".

    Returns:
        ``True`` only when every condition above holds, else ``False``.
    """
    # Local import keeps this function self-contained without touching the
    # module's import block.
    import re

    # Cheap structural checks first; the error is only flattened to text
    # once the status, endpoint and model all match.
    if getattr(error, "status_code", None) != 429:
        return False
    if "api.z.ai/api/coding/paas/v4" not in (base_url or "").lower():
        return False
    if "glm-5.2" not in (model or "").lower():
        return False

    text = _error_text(error)
    # Match 1305 only as a standalone number. A bare substring test would
    # also fire on request ids or timestamps that merely contain those
    # digits (e.g. "20261305123"), misclassifying unrelated 429s.
    has_overload_code = re.search(r"(?<!\d)1305(?!\d)", text) is not None
    return has_overload_code or "temporarily overloaded" in text
def adaptive_rate_limit_backoff(
    attempt: int,
    *,
    base_url: str | None,
    model: str | None,
    error: Any,
    default_wait: float,
    short_attempts: int = 3,
) -> tuple[float, str | None]:
    """Choose a rate-limit wait, with a special schedule for Z.AI overloads.

    Errors that ``is_zai_coding_overload_error`` does not recognise get
    ``default_wait`` back untouched with no label. Recognised Z.AI Coding
    Plan GLM-5.2 overloads also keep ``default_wait`` for the first
    ``short_attempts`` attempts; later attempts step through
    ``_ZAI_CODING_OVERLOAD_LONG_BACKOFF`` (staying on its last entry once the
    table is exhausted) with jitter added on top.

    ``attempt`` is 1-based, matching the retry loop's logged attempt number.

    Returns:
        ``(wait_seconds, reason_label)``. The label is ``None`` when no
        provider-specific policy fired, otherwise
        ``"zai_coding_overload_short"`` or ``"zai_coding_overload_long"``.
    """
    overloaded = is_zai_coding_overload_error(base_url=base_url, model=model, error=error)
    if not overloaded:
        return default_wait, None

    if attempt <= short_attempts:
        return default_wait, "zai_coding_overload_short"

    # Position in the long schedule, clamped to its final entry.
    last_step = len(_ZAI_CODING_OVERLOAD_LONG_BACKOFF) - 1
    step = min(attempt - short_attempts - 1, last_step)
    base_delay = _ZAI_CODING_OVERLOAD_LONG_BACKOFF[step]

    # A smaller jitter ratio keeps long waits readable while still avoiding
    # synchronized retry storms across concurrent Hermes sessions.
    wait_seconds = jittered_backoff(
        1,
        base_delay=base_delay,
        max_delay=base_delay,
        jitter_ratio=0.2,
    )
    return wait_seconds, "zai_coding_overload_long"

View file

@ -91,3 +91,34 @@ def test_custom_provider_extra_body_ignores_other_custom_models():
)
assert agent.request_overrides == {}
def test_named_custom_provider_extra_body_matches_provider_key():
    """A ``custom:<key>`` provider takes extra_body from the matching entry.

    Both configured entries point at the same endpoint and model; only the
    second one carries the provider key named by the agent, so its
    ``extra_body`` is the one expected in ``request_overrides``.
    """
    unrelated_entry = {
        "provider_key": "other-provider",
        "name": "Other Provider",
        "base_url": "https://api.z.ai/api/coding/paas/v4",
        "model": "glm-5.2",
        "extra_body": {"enable_thinking": True},
    }
    matching_entry = {
        "provider_key": "zai-coding-plan",
        "name": "Z.AI Coding Plan",
        "base_url": "https://api.z.ai/api/coding/paas/v4/",
        "model": "glm-5.2",
        "extra_body": {"enable_thinking": False},
    }
    agent = SimpleNamespace(
        provider="custom:zai-coding-plan",
        model="glm-5.2",
        base_url="https://api.z.ai/api/coding/paas/v4",
        request_overrides={},
    )

    _merge_custom_provider_extra_body(agent, [unrelated_entry, matching_entry])

    expected_overrides = {"extra_body": {"enable_thinking": False}}
    assert agent.request_overrides == expected_overrides

View file

@ -3,7 +3,9 @@
import threading
import agent.retry_utils as retry_utils
from agent.retry_utils import jittered_backoff
from types import SimpleNamespace
from agent.retry_utils import adaptive_rate_limit_backoff, is_zai_coding_overload_error, jittered_backoff
def test_backoff_is_exponential():
@ -115,3 +117,96 @@ def test_backoff_uses_locked_tick_for_seed(monkeypatch):
assert len(recorded_seeds) == 2
assert len(set(recorded_seeds)) == 2, f"Expected unique seeds, got {recorded_seeds}"
def _zai_overload_error():
return SimpleNamespace(
status_code=429,
body={
"error": {
"code": "1305",
"message": "The service may be temporarily overloaded, please try again later",
}
},
)
def test_zai_coding_overload_classifier_is_narrow():
    """Only the coding endpoint + glm-5.2 + overload body is classified."""
    coding_url = "https://api.z.ai/api/coding/paas/v4"
    err = _zai_overload_error()
    balance_error = SimpleNamespace(
        status_code=429,
        body={"error": {"code": "1113", "message": "Insufficient balance"}},
    )

    assert is_zai_coding_overload_error(base_url=coding_url, model="glm-5.2", error=err)

    # Each case changes exactly one ingredient of the matching shape:
    # the endpoint, the model, then the error body.
    rejected_cases = (
        ("https://api.z.ai/api/paas/v4", "glm-5.2", err),
        (coding_url, "glm-5.1", err),
        (coding_url, "glm-5.2", balance_error),
    )
    for url, model_name, candidate in rejected_cases:
        assert not is_zai_coding_overload_error(
            base_url=url,
            model=model_name,
            error=candidate,
        )
def test_zai_coding_overload_backoff_keeps_first_retries_short(monkeypatch):
    """Attempts 1 and 3 return the default wait with the short-policy label."""
    # Replace the jittered helper with one that echoes its base_delay.
    monkeypatch.setattr(retry_utils, "jittered_backoff", lambda *a, **kw: kw["base_delay"])
    err = _zai_overload_error()

    for attempt, default_wait in ((1, 2.5), (3, 9.0)):
        wait, policy = adaptive_rate_limit_backoff(
            attempt,
            base_url="https://api.z.ai/api/coding/paas/v4",
            model="glm-5.2",
            error=err,
            default_wait=default_wait,
        )
        assert wait == default_wait
        assert policy == "zai_coding_overload_short"
def test_zai_coding_overload_backoff_grows_after_short_retries(monkeypatch):
    """Attempts 4 through 9 walk the long schedule and hold at its last step."""
    # Replace the jittered helper with one that echoes its base_delay, so the
    # observed waits are exactly the schedule entries.
    monkeypatch.setattr(retry_utils, "jittered_backoff", lambda *a, **kw: kw["base_delay"])
    err = _zai_overload_error()
    observed_waits = []

    attempt = 4
    while attempt < 10:
        wait, policy = adaptive_rate_limit_backoff(
            attempt,
            base_url="https://api.z.ai/api/coding/paas/v4",
            model="glm-5.2",
            error=err,
            default_wait=10.0,
        )
        observed_waits.append(wait)
        assert policy == "zai_coding_overload_long"
        attempt += 1

    assert observed_waits == [30.0, 60.0, 90.0, 120.0, 120.0, 120.0]
def test_non_zai_backoff_returns_default_wait():
    """A non-Z.AI base URL gets the default wait back and no policy label."""
    call_kwargs = {
        "base_url": "https://openrouter.ai/api/v1",
        "model": "glm-5.2",
        "error": _zai_overload_error(),
        "default_wait": 12.0,
    }

    result = adaptive_rate_limit_backoff(10, **call_kwargs)
    wait, policy = result

    assert wait == 12.0
    assert policy is None