mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
fix: handle named custom providers and Z.AI overload retries
This commit is contained in:
parent
736e981abf
commit
6f2b2a1f34
5 changed files with 237 additions and 5 deletions
|
|
@ -106,7 +106,12 @@ def _custom_provider_extra_body_for_agent(
|
|||
base_url: str,
|
||||
custom_providers: List[Dict[str, Any]],
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
if (provider or "").strip().lower() != "custom":
|
||||
provider_norm = (provider or "").strip().lower()
|
||||
if provider_norm == "custom":
|
||||
provider_key_filter = ""
|
||||
elif provider_norm.startswith("custom:"):
|
||||
provider_key_filter = provider_norm.split(":", 1)[1].strip()
|
||||
else:
|
||||
return None
|
||||
|
||||
target_url = _normalized_custom_base_url(base_url)
|
||||
|
|
@ -117,6 +122,13 @@ def _custom_provider_extra_body_for_agent(
|
|||
for entry in custom_providers or []:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
if provider_key_filter:
|
||||
entry_keys = {
|
||||
str(entry.get("provider_key", "") or "").strip().lower(),
|
||||
str(entry.get("name", "") or "").strip().lower(),
|
||||
}
|
||||
if provider_key_filter not in entry_keys:
|
||||
continue
|
||||
if _normalized_custom_base_url(entry.get("base_url")) != target_url:
|
||||
continue
|
||||
extra_body = entry.get("extra_body")
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ from agent.model_metadata import (
|
|||
)
|
||||
from agent.process_bootstrap import _install_safe_stdio
|
||||
from agent.prompt_caching import apply_anthropic_cache_control
|
||||
from agent.retry_utils import jittered_backoff
|
||||
from agent.retry_utils import adaptive_rate_limit_backoff, jittered_backoff
|
||||
from agent.trajectory import has_incomplete_scratchpad
|
||||
from agent.usage_pricing import estimate_usage_cost, normalize_usage
|
||||
from hermes_constants import PARTIAL_STREAM_STUB_ID
|
||||
|
|
@ -3537,16 +3537,38 @@ def run_conversation(
|
|||
except (TypeError, ValueError):
|
||||
pass
|
||||
wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
|
||||
_backoff_policy = None
|
||||
if is_rate_limited and not _retry_after:
|
||||
wait_time, _backoff_policy = adaptive_rate_limit_backoff(
|
||||
retry_count,
|
||||
base_url=str(_base),
|
||||
model=_model,
|
||||
error=api_error,
|
||||
default_wait=wait_time,
|
||||
)
|
||||
if is_rate_limited:
|
||||
agent._buffer_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...")
|
||||
_policy_note = ""
|
||||
if _backoff_policy == "zai_coding_overload_long":
|
||||
_policy_note = " (Z.AI Coding overload adaptive long backoff)"
|
||||
elif _backoff_policy == "zai_coding_overload_short":
|
||||
_policy_note = " (Z.AI Coding overload short retry)"
|
||||
_rate_limit_status = f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries}){_policy_note}..."
|
||||
# Normal retries are buffered to avoid noisy transient chatter. Long
|
||||
# Z.AI Coding waits are different: they can last minutes, so surface
|
||||
# progress immediately instead of making the TUI look frozen.
|
||||
if _backoff_policy == "zai_coding_overload_long":
|
||||
agent._emit_status(_rate_limit_status)
|
||||
else:
|
||||
agent._buffer_status(_rate_limit_status)
|
||||
else:
|
||||
agent._buffer_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...")
|
||||
logger.warning(
|
||||
"Retrying API call in %ss (attempt %s/%s) %s error=%s",
|
||||
"Retrying API call in %ss (attempt %s/%s) %s policy=%s error=%s",
|
||||
wait_time,
|
||||
retry_count,
|
||||
max_retries,
|
||||
agent._client_log_context(),
|
||||
_backoff_policy or "default",
|
||||
api_error,
|
||||
)
|
||||
# Sleep in small increments so we can respond to interrupts quickly
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ rate-limited provider concurrently.
|
|||
import random
|
||||
import threading
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
# Monotonic counter for jitter seed uniqueness within the same process.
|
||||
# Protected by a lock to avoid race conditions in concurrent retry paths
|
||||
|
|
@ -15,6 +16,14 @@ import time
|
|||
_jitter_counter = 0
|
||||
_jitter_lock = threading.Lock()
|
||||
|
||||
# Z.AI Coding Plan's GLM-5.2 endpoint often returns HTTP 429 code 1305
|
||||
# ("The service may be temporarily overloaded...") for otherwise valid
|
||||
# Hermes requests. Short retries tend to hammer the same overloaded window;
|
||||
# after a few normal retries, progressively widen the wait window. Keep the
|
||||
# cap interactive-friendly: a simple TUI message should fail visibly in minutes,
|
||||
# not sit silent for 20+ minutes.
|
||||
_ZAI_CODING_OVERLOAD_LONG_BACKOFF = (30.0, 60.0, 90.0, 120.0)
|
||||
|
||||
|
||||
def jittered_backoff(
|
||||
attempt: int,
|
||||
|
|
@ -55,3 +64,66 @@ def jittered_backoff(
|
|||
jitter = rng.uniform(0, jitter_ratio * delay)
|
||||
|
||||
return delay + jitter
|
||||
|
||||
|
||||
def _error_text(error: Any) -> str:
|
||||
"""Best-effort flattened provider error text for retry classification."""
|
||||
parts = [
|
||||
error,
|
||||
getattr(error, "message", None),
|
||||
getattr(error, "body", None),
|
||||
getattr(error, "response", None),
|
||||
]
|
||||
return " ".join(str(part) for part in parts if part is not None).lower()
|
||||
|
||||
|
||||
def is_zai_coding_overload_error(*, base_url: str | None, model: str | None, error: Any) -> bool:
    """Return True for Z.AI Coding Plan transient overload 429s.

    The coding-plan endpoint reports overload as HTTP 429 with body code 1305
    and message "The service may be temporarily overloaded...". Treat only
    that narrow shape specially so ordinary quota/billing 429s still fail fast
    through the existing classifier.

    Args:
        base_url: Endpoint the request was sent to; must contain the coding
            plan path ``api.z.ai/api/coding/paas/v4`` (case-insensitive).
        model: Model name; must contain ``glm-5.2`` (case-insensitive).
        error: Provider error object. Its ``status_code`` attribute must be
            429 and its flattened text must carry the overload marker.

    Returns:
        True only when every condition above holds.
    """
    # Function-scope import keeps this block self-contained.
    import re

    if getattr(error, "status_code", None) != 429:
        return False
    if "api.z.ai/api/coding/paas/v4" not in (base_url or "").lower():
        return False
    if "glm-5.2" not in (model or "").lower():
        return False

    text = _error_text(error)
    if "temporarily overloaded" in text:
        return True
    # The flattened text also contains str(error) and str(response), so a bare
    # substring test would fire on any longer digit run (request ids,
    # timestamps such as "...113052..."). Require 1305 as a standalone number.
    return re.search(r"(?<!\d)1305(?!\d)", text) is not None
|
||||
|
||||
|
||||
def adaptive_rate_limit_backoff(
    attempt: int,
    *,
    base_url: str | None,
    model: str | None,
    error: Any,
    default_wait: float,
    short_attempts: int = 3,
) -> tuple[float, str | None]:
    """Choose a rate-limit wait, with a special schedule for Z.AI overloads.

    Errors that ``is_zai_coding_overload_error`` does not recognise get
    ``default_wait`` back with no label. Recognised overloads keep
    ``default_wait`` while ``attempt <= short_attempts``; after that the wait
    is taken from ``_ZAI_CODING_OVERLOAD_LONG_BACKOFF``, one tier per further
    attempt and pinned to the last tier once the table is exhausted, then
    passed through ``jittered_backoff`` with a 0.2 jitter ratio.

    Args:
        attempt: Retry attempt number, treated as 1-based.
        base_url: Endpoint of the failing request.
        model: Model name of the failing request.
        error: Provider error object used for classification.
        default_wait: Wait the caller would otherwise use.
        short_attempts: How many initial attempts stay on ``default_wait``.

    Returns:
        ``(wait_seconds, policy_label)``; the label is ``None``,
        ``"zai_coding_overload_short"`` or ``"zai_coding_overload_long"``.
    """
    overloaded = is_zai_coding_overload_error(base_url=base_url, model=model, error=error)
    if not overloaded:
        return default_wait, None

    if attempt <= short_attempts:
        return default_wait, "zai_coding_overload_short"

    # Zero-based position in the long schedule, clamped to its final entry.
    last_tier = len(_ZAI_CODING_OVERLOAD_LONG_BACKOFF) - 1
    tier = min(attempt - short_attempts - 1, last_tier)
    tier_delay = _ZAI_CODING_OVERLOAD_LONG_BACKOFF[tier]

    # base_delay == max_delay with attempt 1: presumably this keeps the
    # exponential part flat so only the jitter varies the wait.
    wait_seconds = jittered_backoff(
        1,
        base_delay=tier_delay,
        max_delay=tier_delay,
        jitter_ratio=0.2,
    )
    return wait_seconds, "zai_coding_overload_long"
|
||||
|
|
|
|||
|
|
@ -91,3 +91,34 @@ def test_custom_provider_extra_body_ignores_other_custom_models():
|
|||
)
|
||||
|
||||
assert agent.request_overrides == {}
|
||||
|
||||
|
||||
def test_named_custom_provider_extra_body_matches_provider_key():
    """A ``custom:<key>`` provider must use the entry with that provider key.

    Both configured entries point at the same endpoint and model, so only the
    provider key can tell them apart.
    """
    unrelated_entry = {
        "provider_key": "other-provider",
        "name": "Other Provider",
        "base_url": "https://api.z.ai/api/coding/paas/v4",
        "model": "glm-5.2",
        "extra_body": {"enable_thinking": True},
    }
    # This entry's URL carries a trailing slash the agent's URL lacks; the
    # test expects it to be selected regardless.
    matching_entry = {
        "provider_key": "zai-coding-plan",
        "name": "Z.AI Coding Plan",
        "base_url": "https://api.z.ai/api/coding/paas/v4/",
        "model": "glm-5.2",
        "extra_body": {"enable_thinking": False},
    }
    agent = SimpleNamespace(
        provider="custom:zai-coding-plan",
        model="glm-5.2",
        base_url="https://api.z.ai/api/coding/paas/v4",
        request_overrides={},
    )

    _merge_custom_provider_extra_body(agent, [unrelated_entry, matching_entry])

    expected_overrides = {"extra_body": {"enable_thinking": False}}
    assert agent.request_overrides == expected_overrides
|
||||
|
|
|
|||
|
|
@ -3,7 +3,9 @@
|
|||
import threading
|
||||
|
||||
import agent.retry_utils as retry_utils
|
||||
from agent.retry_utils import jittered_backoff
|
||||
from types import SimpleNamespace
|
||||
|
||||
from agent.retry_utils import adaptive_rate_limit_backoff, is_zai_coding_overload_error, jittered_backoff
|
||||
|
||||
|
||||
def test_backoff_is_exponential():
|
||||
|
|
@ -115,3 +117,96 @@ def test_backoff_uses_locked_tick_for_seed(monkeypatch):
|
|||
|
||||
assert len(recorded_seeds) == 2
|
||||
assert len(set(recorded_seeds)) == 2, f"Expected unique seeds, got {recorded_seeds}"
|
||||
|
||||
|
||||
def _zai_overload_error():
|
||||
return SimpleNamespace(
|
||||
status_code=429,
|
||||
body={
|
||||
"error": {
|
||||
"code": "1305",
|
||||
"message": "The service may be temporarily overloaded, please try again later",
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def test_zai_coding_overload_classifier_is_narrow():
    """The classifier accepts the exact overload shape and rejects near misses.

    Each rejected case changes one thing relative to the accepted case: the
    endpoint, the model, or the error body.
    """
    overload = _zai_overload_error()

    assert is_zai_coding_overload_error(
        base_url="https://api.z.ai/api/coding/paas/v4",
        model="glm-5.2",
        error=overload,
    )

    balance_error = SimpleNamespace(
        status_code=429,
        body={"error": {"code": "1113", "message": "Insufficient balance"}},
    )
    near_misses = (
        # Non-coding endpoint.
        ("https://api.z.ai/api/paas/v4", "glm-5.2", overload),
        # Different model.
        ("https://api.z.ai/api/coding/paas/v4", "glm-5.1", overload),
        # Same status, different error code and message.
        ("https://api.z.ai/api/coding/paas/v4", "glm-5.2", balance_error),
    )
    for url, model_name, candidate in near_misses:
        assert not is_zai_coding_overload_error(
            base_url=url,
            model=model_name,
            error=candidate,
        )
|
||||
|
||||
|
||||
def test_zai_coding_overload_backoff_keeps_first_retries_short(monkeypatch):
    """Attempts 1 and 3 return the caller's default wait with the short label."""
    # Replace the jitter helper with one that echoes base_delay so any wait it
    # produced would be deterministic.
    monkeypatch.setattr(retry_utils, "jittered_backoff", lambda *a, **kw: kw["base_delay"])
    overload = _zai_overload_error()

    cases = ((1, 2.5), (3, 9.0))
    for attempt, supplied_default in cases:
        wait, policy = adaptive_rate_limit_backoff(
            attempt,
            base_url="https://api.z.ai/api/coding/paas/v4",
            model="glm-5.2",
            error=overload,
            default_wait=supplied_default,
        )
        assert wait == supplied_default
        assert policy == "zai_coding_overload_short"
|
||||
|
||||
|
||||
def test_zai_coding_overload_backoff_grows_after_short_retries(monkeypatch):
    """Attempts 4 through 9 climb the long schedule and stay at its last step."""
    # With jitter replaced by an echo of base_delay, the waits are exact.
    monkeypatch.setattr(retry_utils, "jittered_backoff", lambda *a, **kw: kw["base_delay"])
    overload = _zai_overload_error()

    outcomes = []
    for attempt in range(4, 10):
        wait, policy = adaptive_rate_limit_backoff(
            attempt,
            base_url="https://api.z.ai/api/coding/paas/v4",
            model="glm-5.2",
            error=overload,
            default_wait=10.0,
        )
        outcomes.append((wait, policy))

    assert all(policy == "zai_coding_overload_long" for _, policy in outcomes)
    assert [wait for wait, _ in outcomes] == [30.0, 60.0, 90.0, 120.0, 120.0, 120.0]
|
||||
|
||||
|
||||
def test_non_zai_backoff_returns_default_wait():
    """An overload-shaped error from another endpoint gets the default wait."""
    result = adaptive_rate_limit_backoff(
        10,
        base_url="https://openrouter.ai/api/v1",
        model="glm-5.2",
        error=_zai_overload_error(),
        default_wait=12.0,
    )
    wait, policy = result

    assert wait == 12.0
    assert policy is None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue