"""API error classification for smart failover and recovery. Provides a structured taxonomy of API errors and a priority-ordered classification pipeline that determines the correct recovery action (retry, rotate credential, fallback to another provider, compress context, or abort). Replaces scattered inline string-matching with a centralized classifier that the main retry loop in run_agent.py consults for every API failure. """ from __future__ import annotations import enum import logging from dataclasses import dataclass, field from typing import Any, Dict, Optional logger = logging.getLogger(__name__) # ── Error taxonomy ────────────────────────────────────────────────────── class FailoverReason(enum.Enum): """Why an API call failed — determines recovery strategy.""" # Authentication / authorization auth = "auth" # Transient auth (401/403) — refresh/rotate auth_permanent = "auth_permanent" # Auth failed after refresh — abort # Billing / quota billing = "billing" # 402 or confirmed credit exhaustion — rotate immediately rate_limit = "rate_limit" # 429 or quota-based throttling — backoff then rotate # Server-side overloaded = "overloaded" # 503/529 — provider overloaded, backoff server_error = "server_error" # 500/502 — internal server error, retry # Transport timeout = "timeout" # Connection/read timeout — rebuild client + retry # Context / payload context_overflow = "context_overflow" # Context too large — compress, not failover payload_too_large = "payload_too_large" # 413 — compress payload # Model model_not_found = "model_not_found" # 404 or invalid model — fallback to different model provider_policy_blocked = "provider_policy_blocked" # Aggregator (e.g. OpenRouter) blocked the only endpoint due to account data/privacy policy # Request format format_error = "format_error" # 400 bad request — abort or strip + retry # Provider-specific thinking_signature = "thinking_signature" # Anthropic thinking block sig invalid long_context_tier = "long_context_tier" # Anthropic "extra usage" tier gate # Catch-all unknown = "unknown" # Unclassifiable — retry with backoff # ── Classification result ─────────────────────────────────────────────── @dataclass class ClassifiedError: """Structured classification of an API error with recovery hints.""" reason: FailoverReason status_code: Optional[int] = None provider: Optional[str] = None model: Optional[str] = None message: str = "" error_context: Dict[str, Any] = field(default_factory=dict) # Recovery action hints — the retry loop checks these instead of # re-classifying the error itself. retryable: bool = True should_compress: bool = False should_rotate_credential: bool = False should_fallback: bool = False @property def is_auth(self) -> bool: return self.reason in (FailoverReason.auth, FailoverReason.auth_permanent) # ── Provider-specific patterns ────────────────────────────────────────── # Patterns that indicate billing exhaustion (not transient rate limit) _BILLING_PATTERNS = [ "insufficient credits", "insufficient_quota", "credit balance", "credits have been exhausted", "top up your credits", "payment required", "billing hard limit", "exceeded your current quota", "account is deactivated", "plan does not include", ] # Patterns that indicate rate limiting (transient, will resolve) _RATE_LIMIT_PATTERNS = [ "rate limit", "rate_limit", "too many requests", "throttled", "requests per minute", "tokens per minute", "requests per day", "try again in", "please retry after", "resource_exhausted", "rate increased too quickly", # Alibaba/DashScope throttling # AWS Bedrock throttling "throttlingexception", "too many concurrent requests", "servicequotaexceededexception", ] # Usage-limit patterns that need disambiguation (could be billing OR rate_limit) _USAGE_LIMIT_PATTERNS = [ "usage limit", "quota", "limit exceeded", "key limit exceeded", ] # Patterns confirming usage limit is transient (not billing) _USAGE_LIMIT_TRANSIENT_SIGNALS = [ "try again", "retry", "resets at", "reset in", "wait", "requests remaining", "periodic", "window", ] # Payload-too-large patterns detected from message text (no status_code attr). # Proxies and some backends embed the HTTP status in the error message. _PAYLOAD_TOO_LARGE_PATTERNS = [ "request entity too large", "payload too large", "error code: 413", ] # Context overflow patterns _CONTEXT_OVERFLOW_PATTERNS = [ "context length", "context size", "maximum context", "token limit", "too many tokens", "reduce the length", "exceeds the limit", "context window", "prompt is too long", "prompt exceeds max length", "max_tokens", "maximum number of tokens", # vLLM / local inference server patterns "exceeds the max_model_len", "max_model_len", "prompt length", # "engine prompt length X exceeds" "input is too long", "maximum model length", # Ollama patterns "context length exceeded", "truncating input", # llama.cpp / llama-server patterns "slot context", # "slot context: N tokens, prompt N tokens" "n_ctx_slot", # Chinese error messages (some providers return these) "超过最大长度", "上下文长度", # AWS Bedrock Converse API error patterns "input is too long", "max input token", "input token", "exceeds the maximum number of input tokens", ] # Model not found patterns _MODEL_NOT_FOUND_PATTERNS = [ "is not a valid model", "invalid model", "model not found", "model_not_found", "does not exist", "no such model", "unknown model", "unsupported model", ] # OpenRouter aggregator policy-block patterns. # # When a user's OpenRouter account privacy setting (or a per-request # `provider.data_collection: deny` preference) excludes the only endpoint # serving a model, OpenRouter returns 404 with a *specific* message that is # distinct from "model not found": # # "No endpoints available matching your guardrail restrictions and # data policy. Configure: https://openrouter.ai/settings/privacy" # # We classify this as `provider_policy_blocked` rather than # `model_not_found` because: # - The model *exists* — model_not_found is misleading in logs # - Provider fallback won't help: the account-level setting applies to # every call on the same OpenRouter account # - The error body already contains the fix URL, so the user gets # actionable guidance without us rewriting the message _PROVIDER_POLICY_BLOCKED_PATTERNS = [ "no endpoints available matching your guardrail", "no endpoints available matching your data policy", "no endpoints found matching your data policy", ] # Auth patterns (non-status-code signals) _AUTH_PATTERNS = [ "invalid api key", "invalid_api_key", "authentication", "unauthorized", "forbidden", "invalid token", "token expired", "token revoked", "access denied", ] # Anthropic thinking block signature patterns _THINKING_SIG_PATTERNS = [ "signature", # Combined with "thinking" check ] # Transport error type names _TRANSPORT_ERROR_TYPES = frozenset({ "ReadTimeout", "ConnectTimeout", "PoolTimeout", "ConnectError", "RemoteProtocolError", "ConnectionError", "ConnectionResetError", "ConnectionAbortedError", "BrokenPipeError", "TimeoutError", "ReadError", "ServerDisconnectedError", # SSL/TLS transport errors — transient mid-stream handshake/record # failures that should retry rather than surface as a stalled session. # ssl.SSLError subclasses OSError (caught by isinstance) but we list # the type names here so provider-wrapped SSL errors (e.g. when the # SDK re-raises without preserving the exception chain) still classify # as transport rather than falling through to the unknown bucket. "SSLError", "SSLZeroReturnError", "SSLWantReadError", "SSLWantWriteError", "SSLEOFError", "SSLSyscallError", # OpenAI SDK errors (not subclasses of Python builtins) "APIConnectionError", "APITimeoutError", }) # Server disconnect patterns (no status code, but transport-level). # These are the "ambiguous" patterns — a plain connection close could be # transient transport hiccup OR server-side context overflow rejection # (common when the API gateway disconnects instead of returning an HTTP # error for oversized requests). A large session + one of these patterns # triggers the context-overflow-with-compression recovery path. _SERVER_DISCONNECT_PATTERNS = [ "server disconnected", "peer closed connection", "connection reset by peer", "connection was closed", "network connection lost", "unexpected eof", "incomplete chunked read", ] # SSL/TLS transient failure patterns — intentionally distinct from # _SERVER_DISCONNECT_PATTERNS above. # # An SSL alert mid-stream is almost always a transport-layer hiccup # (flaky network, mid-session TLS renegotiation failure, load balancer # dropping the connection) — NOT a server-side context overflow signal. # So we want the retry path but NOT the compression path; lumping these # into _SERVER_DISCONNECT_PATTERNS would trigger unnecessary (and # expensive) context compression on any large-session SSL hiccup. # # The OpenSSL library constructs error codes by prepending a format string # to the uppercased alert reason; OpenSSL 3.x changed the separator # (e.g. `SSLV3_ALERT_BAD_RECORD_MAC` → `SSL/TLS_ALERT_BAD_RECORD_MAC`), # which silently stopped matching anything explicit. Matching on the # stable substrings (`bad record mac`, `ssl alert`, `tls alert`, etc.) # survives future OpenSSL format churn without code changes. _SSL_TRANSIENT_PATTERNS = [ # Space-separated (human-readable form, Python ssl module, most SDKs) "bad record mac", "ssl alert", "tls alert", "ssl handshake failure", "tlsv1 alert", "sslv3 alert", # Underscore-separated (OpenSSL error code tokens, e.g. # `ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC`, `SSLV3_ALERT_BAD_RECORD_MAC`) "bad_record_mac", "ssl_alert", "tls_alert", "tls_alert_internal_error", # Python ssl module prefix, e.g. "[SSL: BAD_RECORD_MAC]" "[ssl:", ] # ── Classification pipeline ───────────────────────────────────────────── def classify_api_error( error: Exception, *, provider: str = "", model: str = "", approx_tokens: int = 0, context_length: int = 200000, num_messages: int = 0, ) -> ClassifiedError: """Classify an API error into a structured recovery recommendation. Priority-ordered pipeline: 1. Special-case provider-specific patterns (thinking sigs, tier gates) 2. HTTP status code + message-aware refinement 3. Error code classification (from body) 4. Message pattern matching (billing vs rate_limit vs context vs auth) 5. SSL/TLS transient alert patterns → retry as timeout 6. Server disconnect + large session → context overflow 7. Transport error heuristics 8. Fallback: unknown (retryable with backoff) Args: error: The exception from the API call. provider: Current provider name (e.g. "openrouter", "anthropic"). model: Current model slug. approx_tokens: Approximate token count of the current context. context_length: Maximum context length for the current model. Returns: ClassifiedError with reason and recovery action hints. """ status_code = _extract_status_code(error) error_type = type(error).__name__ # Copilot/GitHub Models RateLimitError may not set .status_code; force 429 # so downstream rate-limit handling (classifier reason, pool rotation, # fallback gating) fires correctly instead of misclassifying as generic. if status_code is None and error_type == "RateLimitError": status_code = 429 body = _extract_error_body(error) error_code = _extract_error_code(body) # Build a comprehensive error message string for pattern matching. # str(error) alone may not include the body message (e.g. OpenAI SDK's # APIStatusError.__str__ returns the first arg, not the body). Append # the body message so patterns like "try again" in 402 disambiguation # are detected even when only present in the structured body. # # Also extract metadata.raw — OpenRouter wraps upstream provider errors # inside {"error": {"message": "Provider returned error", "metadata": # {"raw": ""}}} and the real error message (e.g. # "context length exceeded") is only in the inner JSON. _raw_msg = str(error).lower() _body_msg = "" _metadata_msg = "" if isinstance(body, dict): _err_obj = body.get("error", {}) if isinstance(_err_obj, dict): _body_msg = str(_err_obj.get("message") or "").lower() # Parse metadata.raw for wrapped provider errors _metadata = _err_obj.get("metadata", {}) if isinstance(_metadata, dict): _raw_json = _metadata.get("raw") or "" if isinstance(_raw_json, str) and _raw_json.strip(): try: import json _inner = json.loads(_raw_json) if isinstance(_inner, dict): _inner_err = _inner.get("error", {}) if isinstance(_inner_err, dict): _metadata_msg = str(_inner_err.get("message") or "").lower() except (json.JSONDecodeError, TypeError): pass if not _body_msg: _body_msg = str(body.get("message") or "").lower() # Combine all message sources for pattern matching parts = [_raw_msg] if _body_msg and _body_msg not in _raw_msg: parts.append(_body_msg) if _metadata_msg and _metadata_msg not in _raw_msg and _metadata_msg not in _body_msg: parts.append(_metadata_msg) error_msg = " ".join(parts) provider_lower = (provider or "").strip().lower() model_lower = (model or "").strip().lower() def _result(reason: FailoverReason, **overrides) -> ClassifiedError: defaults = { "reason": reason, "status_code": status_code, "provider": provider, "model": model, "message": _extract_message(error, body), } defaults.update(overrides) return ClassifiedError(**defaults) # ── 1. Provider-specific patterns (highest priority) ──────────── # Anthropic thinking block signature invalid (400). # Don't gate on provider — OpenRouter proxies Anthropic errors, so the # provider may be "openrouter" even though the error is Anthropic-specific. # The message pattern ("signature" + "thinking") is unique enough. if ( status_code == 400 and "signature" in error_msg and "thinking" in error_msg ): return _result( FailoverReason.thinking_signature, retryable=True, should_compress=False, ) # Anthropic long-context tier gate (429 "extra usage" + "long context") if ( status_code == 429 and "extra usage" in error_msg and "long context" in error_msg ): return _result( FailoverReason.long_context_tier, retryable=True, should_compress=True, ) # ── 2. HTTP status code classification ────────────────────────── if status_code is not None: classified = _classify_by_status( status_code, error_msg, error_code, body, provider=provider_lower, model=model_lower, approx_tokens=approx_tokens, context_length=context_length, num_messages=num_messages, result_fn=_result, ) if classified is not None: return classified # ── 3. Error code classification ──────────────────────────────── if error_code: classified = _classify_by_error_code(error_code, error_msg, _result) if classified is not None: return classified # ── 4. Message pattern matching (no status code) ──────────────── classified = _classify_by_message( error_msg, error_type, approx_tokens=approx_tokens, context_length=context_length, result_fn=_result, ) if classified is not None: return classified # ── 5. SSL/TLS transient errors → retry as timeout (not compression) ── # SSL alerts mid-stream are transport hiccups, not server-side context # overflow signals. Classify before the disconnect check so a large # session doesn't incorrectly trigger context compression when the real # cause is a flaky TLS handshake. Also matches when the error is # wrapped in a generic exception whose message string carries the SSL # alert text but the type isn't ssl.SSLError (happens with some SDKs # that re-raise without chaining). if any(p in error_msg for p in _SSL_TRANSIENT_PATTERNS): return _result(FailoverReason.timeout, retryable=True) # ── 6. Server disconnect + large session → context overflow ───── # Must come BEFORE generic transport error catch — a disconnect on # a large session is more likely context overflow than a transient # transport hiccup. Without this ordering, RemoteProtocolError # always maps to timeout regardless of session size. is_disconnect = any(p in error_msg for p in _SERVER_DISCONNECT_PATTERNS) if is_disconnect and not status_code: is_large = approx_tokens > context_length * 0.6 or approx_tokens > 120000 or num_messages > 200 if is_large: return _result( FailoverReason.context_overflow, retryable=True, should_compress=True, ) return _result(FailoverReason.timeout, retryable=True) # ── 7. Transport / timeout heuristics ─────────────────────────── if error_type in _TRANSPORT_ERROR_TYPES or isinstance(error, (TimeoutError, ConnectionError, OSError)): return _result(FailoverReason.timeout, retryable=True) # ── 8. Fallback: unknown ──────────────────────────────────────── return _result(FailoverReason.unknown, retryable=True) # ── Status code classification ────────────────────────────────────────── def _classify_by_status( status_code: int, error_msg: str, error_code: str, body: dict, *, provider: str, model: str, approx_tokens: int, context_length: int, num_messages: int = 0, result_fn, ) -> Optional[ClassifiedError]: """Classify based on HTTP status code with message-aware refinement.""" if status_code == 401: # Not retryable on its own — credential pool rotation and # provider-specific refresh (Codex, Anthropic, Nous) run before # the retryability check in run_agent.py. If those succeed, the # loop `continue`s. If they fail, retryable=False ensures we # hit the client-error abort path (which tries fallback first). return result_fn( FailoverReason.auth, retryable=False, should_rotate_credential=True, should_fallback=True, ) if status_code == 403: # OpenRouter 403 "key limit exceeded" is actually billing if "key limit exceeded" in error_msg or "spending limit" in error_msg: return result_fn( FailoverReason.billing, retryable=False, should_rotate_credential=True, should_fallback=True, ) return result_fn( FailoverReason.auth, retryable=False, should_fallback=True, ) if status_code == 402: return _classify_402(error_msg, result_fn) if status_code == 404: # OpenRouter policy-block 404 — distinct from "model not found". # The model exists; the user's account privacy setting excludes the # only endpoint serving it. Falling back to another provider won't # help (same account setting applies). The error body already # contains the fix URL, so just surface it. if any(p in error_msg for p in _PROVIDER_POLICY_BLOCKED_PATTERNS): return result_fn( FailoverReason.provider_policy_blocked, retryable=False, should_fallback=False, ) if any(p in error_msg for p in _MODEL_NOT_FOUND_PATTERNS): return result_fn( FailoverReason.model_not_found, retryable=False, should_fallback=True, ) # Generic 404 with no "model not found" signal — could be a wrong # endpoint path (common with local llama.cpp / Ollama / vLLM when # the URL is slightly misconfigured), a proxy routing glitch, or # a transient backend issue. Classifying these as model_not_found # silently falls back to a different provider and tells the model # the model is missing, which is wrong and wastes a turn. Treat # as unknown so the retry loop surfaces the real error instead. return result_fn( FailoverReason.unknown, retryable=True, ) if status_code == 413: return result_fn( FailoverReason.payload_too_large, retryable=True, should_compress=True, ) if status_code == 429: # Already checked long_context_tier above; this is a normal rate limit return result_fn( FailoverReason.rate_limit, retryable=True, should_rotate_credential=True, should_fallback=True, ) if status_code == 400: return _classify_400( error_msg, error_code, body, provider=provider, model=model, approx_tokens=approx_tokens, context_length=context_length, num_messages=num_messages, result_fn=result_fn, ) if status_code in (500, 502): return result_fn(FailoverReason.server_error, retryable=True) if status_code in (503, 529): return result_fn(FailoverReason.overloaded, retryable=True) # Other 4xx — non-retryable if 400 <= status_code < 500: return result_fn( FailoverReason.format_error, retryable=False, should_fallback=True, ) # Other 5xx — retryable if 500 <= status_code < 600: return result_fn(FailoverReason.server_error, retryable=True) return None def _classify_402(error_msg: str, result_fn) -> ClassifiedError: """Disambiguate 402: billing exhaustion vs transient usage limit. The key insight from OpenClaw: some 402s are transient rate limits disguised as payment errors. "Usage limit, try again in 5 minutes" is NOT a billing problem — it's a periodic quota that resets. """ # Check for transient usage-limit signals first has_usage_limit = any(p in error_msg for p in _USAGE_LIMIT_PATTERNS) has_transient_signal = any(p in error_msg for p in _USAGE_LIMIT_TRANSIENT_SIGNALS) if has_usage_limit and has_transient_signal: # Transient quota — treat as rate limit, not billing return result_fn( FailoverReason.rate_limit, retryable=True, should_rotate_credential=True, should_fallback=True, ) # Confirmed billing exhaustion return result_fn( FailoverReason.billing, retryable=False, should_rotate_credential=True, should_fallback=True, ) def _classify_400( error_msg: str, error_code: str, body: dict, *, provider: str, model: str, approx_tokens: int, context_length: int, num_messages: int = 0, result_fn, ) -> ClassifiedError: """Classify 400 Bad Request — context overflow, format error, or generic.""" # Context overflow from 400 if any(p in error_msg for p in _CONTEXT_OVERFLOW_PATTERNS): return result_fn( FailoverReason.context_overflow, retryable=True, should_compress=True, ) # Some providers return model-not-found as 400 instead of 404 (e.g. OpenRouter). if any(p in error_msg for p in _PROVIDER_POLICY_BLOCKED_PATTERNS): return result_fn( FailoverReason.provider_policy_blocked, retryable=False, should_fallback=False, ) if any(p in error_msg for p in _MODEL_NOT_FOUND_PATTERNS): return result_fn( FailoverReason.model_not_found, retryable=False, should_fallback=True, ) # Some providers return rate limit / billing errors as 400 instead of 429/402. # Check these patterns before falling through to format_error. if any(p in error_msg for p in _RATE_LIMIT_PATTERNS): return result_fn( FailoverReason.rate_limit, retryable=True, should_rotate_credential=True, should_fallback=True, ) if any(p in error_msg for p in _BILLING_PATTERNS): return result_fn( FailoverReason.billing, retryable=False, should_rotate_credential=True, should_fallback=True, ) # Generic 400 + large session → probable context overflow # Anthropic sometimes returns a bare "Error" message when context is too large err_body_msg = "" if isinstance(body, dict): err_obj = body.get("error", {}) if isinstance(err_obj, dict): err_body_msg = str(err_obj.get("message") or "").strip().lower() # Responses API (and some providers) use flat body: {"message": "..."} if not err_body_msg: err_body_msg = str(body.get("message") or "").strip().lower() is_generic = len(err_body_msg) < 30 or err_body_msg in ("error", "") is_large = approx_tokens > context_length * 0.4 or approx_tokens > 80000 or num_messages > 80 if is_generic and is_large: return result_fn( FailoverReason.context_overflow, retryable=True, should_compress=True, ) # Non-retryable format error return result_fn( FailoverReason.format_error, retryable=False, should_fallback=True, ) # ── Error code classification ─────────────────────────────────────────── def _classify_by_error_code( error_code: str, error_msg: str, result_fn, ) -> Optional[ClassifiedError]: """Classify by structured error codes from the response body.""" code_lower = error_code.lower() if code_lower in ("resource_exhausted", "throttled", "rate_limit_exceeded"): return result_fn( FailoverReason.rate_limit, retryable=True, should_rotate_credential=True, ) if code_lower in ("insufficient_quota", "billing_not_active", "payment_required"): return result_fn( FailoverReason.billing, retryable=False, should_rotate_credential=True, should_fallback=True, ) if code_lower in ("model_not_found", "model_not_available", "invalid_model"): return result_fn( FailoverReason.model_not_found, retryable=False, should_fallback=True, ) if code_lower in ("context_length_exceeded", "max_tokens_exceeded"): return result_fn( FailoverReason.context_overflow, retryable=True, should_compress=True, ) return None # ── Message pattern classification ────────────────────────────────────── def _classify_by_message( error_msg: str, error_type: str, *, approx_tokens: int, context_length: int, result_fn, ) -> Optional[ClassifiedError]: """Classify based on error message patterns when no status code is available.""" # Payload-too-large patterns (from message text when no status_code) if any(p in error_msg for p in _PAYLOAD_TOO_LARGE_PATTERNS): return result_fn( FailoverReason.payload_too_large, retryable=True, should_compress=True, ) # Usage-limit patterns need the same disambiguation as 402: some providers # surface "usage limit" errors without an HTTP status code. A transient # signal ("try again", "resets at", …) means it's a periodic quota, not # billing exhaustion. has_usage_limit = any(p in error_msg for p in _USAGE_LIMIT_PATTERNS) if has_usage_limit: has_transient_signal = any(p in error_msg for p in _USAGE_LIMIT_TRANSIENT_SIGNALS) if has_transient_signal: return result_fn( FailoverReason.rate_limit, retryable=True, should_rotate_credential=True, should_fallback=True, ) return result_fn( FailoverReason.billing, retryable=False, should_rotate_credential=True, should_fallback=True, ) # Billing patterns if any(p in error_msg for p in _BILLING_PATTERNS): return result_fn( FailoverReason.billing, retryable=False, should_rotate_credential=True, should_fallback=True, ) # Rate limit patterns if any(p in error_msg for p in _RATE_LIMIT_PATTERNS): return result_fn( FailoverReason.rate_limit, retryable=True, should_rotate_credential=True, should_fallback=True, ) # Context overflow patterns if any(p in error_msg for p in _CONTEXT_OVERFLOW_PATTERNS): return result_fn( FailoverReason.context_overflow, retryable=True, should_compress=True, ) # Auth patterns # Auth errors should NOT be retried directly — the credential is invalid and # retrying with the same key will always fail. Set retryable=False so the # caller triggers credential rotation (should_rotate_credential=True) or # provider fallback rather than an immediate retry loop. if any(p in error_msg for p in _AUTH_PATTERNS): return result_fn( FailoverReason.auth, retryable=False, should_rotate_credential=True, should_fallback=True, ) # Provider policy-block (aggregator-side guardrail) — check before # model_not_found so we don't mis-label as a missing model. if any(p in error_msg for p in _PROVIDER_POLICY_BLOCKED_PATTERNS): return result_fn( FailoverReason.provider_policy_blocked, retryable=False, should_fallback=False, ) # Model not found patterns if any(p in error_msg for p in _MODEL_NOT_FOUND_PATTERNS): return result_fn( FailoverReason.model_not_found, retryable=False, should_fallback=True, ) return None # ── Helpers ───────────────────────────────────────────────────────────── def _extract_status_code(error: Exception) -> Optional[int]: """Walk the error and its cause chain to find an HTTP status code.""" current = error for _ in range(5): # Max depth to prevent infinite loops code = getattr(current, "status_code", None) if isinstance(code, int): return code # Some SDKs use .status instead of .status_code code = getattr(current, "status", None) if isinstance(code, int) and 100 <= code < 600: return code # Walk cause chain cause = getattr(current, "__cause__", None) or getattr(current, "__context__", None) if cause is None or cause is current: break current = cause return None def _extract_error_body(error: Exception) -> dict: """Extract the structured error body from an SDK exception.""" body = getattr(error, "body", None) if isinstance(body, dict): return body # Some errors have .response.json() response = getattr(error, "response", None) if response is not None: try: json_body = response.json() if isinstance(json_body, dict): return json_body except Exception: pass return {} def _extract_error_code(body: dict) -> str: """Extract an error code string from the response body.""" if not body: return "" error_obj = body.get("error", {}) if isinstance(error_obj, dict): code = error_obj.get("code") or error_obj.get("type") or "" if isinstance(code, str) and code.strip(): return code.strip() # Top-level code code = body.get("code") or body.get("error_code") or "" if isinstance(code, (str, int)): return str(code).strip() return "" def _extract_message(error: Exception, body: dict) -> str: """Extract the most informative error message.""" # Try structured body first if body: error_obj = body.get("error", {}) if isinstance(error_obj, dict): msg = error_obj.get("message", "") if isinstance(msg, str) and msg.strip(): return msg.strip()[:500] msg = body.get("message", "") if isinstance(msg, str) and msg.strip(): return msg.strip()[:500] # Fallback to str(error) return str(error)[:500]