# Mirror of https://github.com/NousResearch/hermes-agent.git
# Synced 2026-06-27 11:22:03 +00:00 (129 lines, 4.7 KiB, Python)
"""Retry utilities — jittered backoff for decorrelated retries.

Replaces fixed exponential backoff with jittered delays to prevent
thundering-herd retry spikes when multiple sessions hit the same
rate-limited provider concurrently.
"""
|
|
|
|
import random
import re
import threading
import time
from typing import Any
|
|
|
|
# Process-wide monotonic counter mixed into every jitter seed so that two
# calls made within the same process still get distinct seeds.
# Guarded by a lock to avoid race conditions in concurrent retry paths
# (e.g. multiple gateway sessions retrying simultaneously).
_jitter_counter = 0
_jitter_lock = threading.Lock()
|
|
|
|
# Z.AI Coding Plan's GLM-5.2 endpoint often returns HTTP 429 code 1305
# ("The service may be temporarily overloaded...") for otherwise valid
# Hermes requests. Short retries tend to hammer the same overloaded window,
# so after a few normal retries the wait window is progressively widened
# using the schedule below (seconds). Keep the cap interactive-friendly: a
# simple TUI message should fail visibly in minutes, not sit silent for
# 20+ minutes.
_ZAI_CODING_OVERLOAD_LONG_BACKOFF = (30.0, 60.0, 90.0, 120.0)
|
|
|
|
|
|
def jittered_backoff(
    attempt: int,
    *,
    base_delay: float = 5.0,
    max_delay: float = 120.0,
    jitter_ratio: float = 0.5,
) -> float:
    """Compute a jittered exponential backoff delay.

    Args:
        attempt: 1-based retry attempt number. Values below 1 are treated
            like attempt 1.
        base_delay: Base delay in seconds for attempt 1. A non-positive
            value skips the exponential ramp and uses ``max_delay``.
        max_delay: Maximum delay cap in seconds, applied before jitter is
            added. Negative values are treated as 0.
        jitter_ratio: Fraction of computed delay to use as random jitter
            range. 0.5 means jitter is uniform in [0, 0.5 * delay].
            Negative values are treated as 0 (no jitter).

    Returns:
        Delay in seconds: min(base * 2^(attempt-1), max_delay) + jitter.
        The result is never negative and may exceed ``max_delay`` by up to
        ``jitter_ratio * max_delay``.

    The jitter decorrelates concurrent retries so multiple sessions
    hitting the same provider don't all retry at the same instant.
    """
    global _jitter_counter
    with _jitter_lock:
        _jitter_counter += 1
        tick = _jitter_counter

    # Clamp caller-supplied bounds so a bad value can never turn into a
    # negative wait (time.sleep() rejects negative durations).
    cap = max(0.0, max_delay)
    ratio = max(0.0, jitter_ratio)

    exponent = max(0, attempt - 1)
    if exponent >= 63 or base_delay <= 0:
        # Very large attempt numbers go straight to the cap instead of
        # computing an enormous power of two.
        delay = cap
    else:
        delay = min(base_delay * (2 ** exponent), cap)

    # Seed from time + counter for decorrelation even with coarse clocks.
    # The tick is multiplied by 0x9E3779B9 (the 32-bit golden-ratio hashing
    # constant) before being XOR-ed into the clock reading, and the result is
    # masked down to 32 bits.
    seed = (time.time_ns() ^ (tick * 0x9E3779B9)) & 0xFFFFFFFF
    jitter = random.Random(seed).uniform(0, ratio * delay)

    return delay + jitter
|
|
|
|
|
|
def _error_text(error: Any) -> str:
    """Best-effort flattened provider error text for retry classification.

    Joins ``str(error)`` with the string forms of its ``message``, ``body``
    and ``response`` attributes (skipping any that are missing or ``None``),
    separated by single spaces, and lower-cases the result so callers can
    run case-insensitive substring checks against it.

    Args:
        error: Any exception or error-like object. ``None`` yields ``""``.

    Returns:
        The lower-cased, space-joined text.
    """
    candidates = [error]
    candidates.extend(
        getattr(error, attr, None) for attr in ("message", "body", "response")
    )
    return " ".join(str(item) for item in candidates if item is not None).lower()
|
|
|
|
|
|
# Matches the overload code only as a standalone number, so longer digit runs
# that merely contain "1305" (request ids, timestamps, counters) do not count.
_ZAI_OVERLOAD_CODE_RE = re.compile(r"(?<!\d)1305(?!\d)")


def is_zai_coding_overload_error(*, base_url: str | None, model: str | None, error: Any) -> bool:
    """Return True for Z.AI Coding Plan transient overload 429s.

    The coding-plan endpoint reports overload as HTTP 429 with body code 1305
    and message "The service may be temporarily overloaded...". Treat only
    that narrow shape specially so ordinary quota/billing 429s still fail fast
    through the existing classifier.

    Args:
        base_url: Provider base URL; must contain the coding-plan path
            ``api.z.ai/api/coding/paas/v4`` (case-insensitive).
        model: Model name; must contain ``glm-5.2`` (case-insensitive).
        error: Provider error object. Its ``status_code`` attribute must equal
            429, and its flattened text must carry either the standalone code
            ``1305`` or the phrase ``temporarily overloaded``.

    Returns:
        True only when all of the conditions above hold.
    """
    if getattr(error, "status_code", None) != 429:
        return False
    if "api.z.ai/api/coding/paas/v4" not in (base_url or "").lower():
        return False
    if "glm-5.2" not in (model or "").lower():
        return False

    # Flatten the error only once the cheap checks have passed.
    text = _error_text(error)
    if "temporarily overloaded" in text:
        return True
    return _ZAI_OVERLOAD_CODE_RE.search(text) is not None
|
|
|
|
|
|
def adaptive_rate_limit_backoff(
    attempt: int,
    *,
    base_url: str | None,
    model: str | None,
    error: Any,
    default_wait: float,
    short_attempts: int = 3,
) -> tuple[float, str | None]:
    """Provider-aware rate-limit backoff.

    For most providers this returns ``default_wait`` unchanged. For Z.AI
    Coding Plan GLM-5.2 overloads, keep the first ``short_attempts`` retries on
    the normal short exponential schedule, then switch to progressively longer
    waits (30s → 60s → 90s → 120s, capped) plus light jitter.

    Args:
        attempt: 1-based attempt number, matching the retry loop's logged
            attempt number.
        base_url: Provider base URL, used to recognise the Z.AI coding-plan
            endpoint.
        model: Model name, used to recognise GLM-5.2.
        error: Provider error being retried.
        default_wait: Wait in seconds the caller would otherwise use.
        short_attempts: Number of initial attempts that keep ``default_wait``
            even when the overload policy applies.

    Returns:
        ``(wait_seconds, reason_label)`` where ``reason_label`` is suitable
        for status/log decoration when a provider-specific policy fired, and
        ``None`` when the default policy was used.
    """
    if not is_zai_coding_overload_error(base_url=base_url, model=model, error=error):
        return default_wait, None
    if attempt <= short_attempts:
        return default_wait, "zai_coding_overload_short"

    # attempt > short_attempts here, so the step index starts at 0 and is
    # pinned to the last schedule entry once the schedule is exhausted.
    step = min(attempt - short_attempts - 1, len(_ZAI_CODING_OVERLOAD_LONG_BACKOFF) - 1)
    long_wait = _ZAI_CODING_OVERLOAD_LONG_BACKOFF[step]

    # A smaller jitter ratio keeps long waits readable while still avoiding
    # synchronized retry storms across concurrent Hermes sessions. Passing
    # attempt=1 with max_delay equal to the base keeps the deterministic part
    # at exactly ``long_wait``.
    wait = jittered_backoff(1, base_delay=long_wait, max_delay=long_wait, jitter_ratio=0.2)
    return wait, "zai_coding_overload_long"
|