From 865a09a6102416c0bae87ae24472fe53c5b2686e Mon Sep 17 00:00:00 2001 From: DavidMetcalfe <80915+DavidMetcalfe@users.noreply.github.com> Date: Thu, 25 Jun 2026 18:48:59 -0700 Subject: [PATCH] fix(agent): detect thinking-timeout for reasoning models and surface actionable guidance instead of misleading file-write advice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two-part fix: Part 1 (classifier override at agent/error_classifier.py:720-738): A transport disconnect on a reasoning model — even on a large session — now routes to FailoverReason.timeout instead of context_overflow. Without this, large-session reasoning-model disconnects route to the compression branch and silently delete conversation history on a phantom context-length error. The override is strictly targeted: non-reasoning models (gpt-4o, claude-3-5-sonnet, llama-3.3-70b, etc.) still route to context_overflow on large sessions — the existing intentional behavior for chat models whose proxy doesn't idle-kill during prefill/generation. Part 2 (new agent/thinking_timeout_guidance.py + integration at agent/conversation_loop.py:3488-3567): New is_thinking_timeout() and build_thinking_timeout_guidance() helpers. When a known reasoning model (NVIDIA Nemotron 3 Ultra, OpenAI o1/o3, Anthropic Opus 4.x thinking, DeepSeek R1, Qwen QwQ, xAI Grok reasoning) hits a transport-kill on a small session (classifier says timeout directly) or after Part 1 routes correctly (large session), the user now sees reasoning-specific guidance with three actionable workarounds in priority order: 1. Set providers..models..stale_timeout_seconds: 900 in ~/.hermes/config.yaml (Hermes's built-in floor is already 600s for known reasoning models; raise further if upstream is even tighter). 2. Lower reasoning_budget or set reasoning_effort: medium on this model if the provider supports it. 3. Use a smaller / faster reasoning model if the task doesn't require deep thinking. The new guidance takes precedence via if/elif over the existing _is_stream_drop block, so a reasoning-model user with a transport-kill message sees actionable advice instead of the misleading "try execute_code with Python's open() for large files" advice (which is correct for the unrelated large-file-write stream-drop case but actively wrong for the thinking-timeout case). Verified: - 478 tests passing across 9 directly-relevant files (49 new + 429 existing, zero regressions). - Ruff lint clean on all 4 modified/new files. - Negative test: 6 parametrized regression guards confirm non-reasoning models still route to context_overflow on large sessions; 4 parametrized gates confirm non-timeout classifier reasons never trigger the guidance; 5 parametrized cases confirm non-transport messages never trigger it. - Regression guard: new guidance message does NOT contain "execute_code" or "open()" — the misleading advice is fully replaced, not appended alongside. - Cross-vendor dual review via agy -p: - Gemini 3.5 Flash (Medium) — passed: true, zero blockers, one SHOULD-FIX (vprint block duplication — fixed by extracting detection into a helper module). - GPT-OSS 120B (Medium) — passed: true, zero blockers, two nits (test placement — adopted at tests/agent/test_thinking_timeout_guidance.py; primary-model capture — accepted as non-issue per Flash's nit). Dependency note for maintainers: This PR includes agent/reasoning_timeouts.py (the reasoning-model allowlist module from PR #52238) because the Layer 1 override is load-bearing on get_reasoning_stale_timeout_floor(). After PR #52238 lands on main, this PR's duplicate agent/reasoning_timeouts.py should be rebased away. Either PR can land first; the other rebase is mechanical. Fixes #52271. --- agent/conversation_loop.py | 76 +++- agent/error_classifier.py | 20 + agent/reasoning_timeouts.py | 216 +++++++++++ agent/thinking_timeout_guidance.py | 136 +++++++ tests/agent/test_thinking_timeout_guidance.py | 355 ++++++++++++++++++ 5 files changed, 802 insertions(+), 1 deletion(-) create mode 100644 agent/reasoning_timeouts.py create mode 100644 agent/thinking_timeout_guidance.py create mode 100644 tests/agent/test_thinking_timeout_guidance.py diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index 23b30d92ec3..0d854537d9e 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -3527,6 +3527,65 @@ def run_conversation( force=True, ) + # Detect thinking-timeout pattern: a known reasoning model + # hit a transport-layer error before the first content + # token arrived. Distinct from _is_stream_drop above + # (which fires for large file-write stream drops) and + # from any classifier reason that's not a transport + # timeout. Reuses the reasoning-model allowlist from + # agent/reasoning_timeouts.py (Fixes #52217) so the + # trigger is consistent with what the per-model + # stale-timeout floor covers. After the classifier + # override at agent/error_classifier.py:720-738 (this + # PR), transport disconnects on reasoning models route + # to FailoverReason.timeout rather than + # context_overflow, so this branch actually fires. + # Detection and message text live in + # agent.thinking_timeout_guidance so they're + # unit-testable without driving the full retry loop. + # (Part 2 of Fixes #52310.) + from agent.thinking_timeout_guidance import ( + is_thinking_timeout, + ) + _is_thinking_timeout = is_thinking_timeout( + classified, + _model, + error_msg, + ) + if _is_thinking_timeout: + agent._vprint( + f"{agent.log_prefix} 💡 The model's thinking " + f"phase exceeded the upstream proxy's idle " + f"timeout before the first content token " + f"arrived. This is a known issue with " + f"reasoning models behind cloud gateways " + f"(NVIDIA NIM, OpenAI, Anthropic, DeepSeek).", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} Workarounds in priority order:", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} 1. Set " + f"`providers.{_provider}.models.{_model}.stale_timeout_seconds: 900` " + f"in `~/.hermes/config.yaml` to extend the per-call " + f"timeout. (Hermes's built-in floor is 600s for " + f"known reasoning models — if you still see this " + f"after raising, the upstream cap is even shorter.)", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} 2. Lower `reasoning_budget` or set " + f"`reasoning_effort: medium` on this model if the provider supports it.", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} 3. Use a smaller / faster reasoning " + f"model if the task doesn't require deep thinking.", + force=True, + ) + logger.error( "%sAPI call failed after %s retries. %s | provider=%s model=%s msgs=%s tokens=~%s", agent.log_prefix, max_retries, _final_summary, @@ -3543,7 +3602,22 @@ def run_conversation( _final_response += f"\n\n{_billing_guidance}" else: _final_response = f"API call failed after {max_retries} retries: {_final_summary}" - if _is_stream_drop: + if _is_thinking_timeout: + # Thinking-timeout guidance overrides the generic + # stream-drop guidance — the latter is wrong for + # this case (it suggests splitting large file + # writes, which isn't what happened). See the + # reasoning-model override at + # agent/error_classifier.py:720-738 and the + # detection block above for context. + from agent.thinking_timeout_guidance import ( + build_thinking_timeout_guidance, + ) + _final_response += build_thinking_timeout_guidance( + provider=_provider, + model=_model, + ) + elif _is_stream_drop: _final_response += ( "\n\nThe provider's stream connection keeps " "dropping — this often happens when generating " diff --git a/agent/error_classifier.py b/agent/error_classifier.py index c39c24a6a5d..c3b356d4e45 100644 --- a/agent/error_classifier.py +++ b/agent/error_classifier.py @@ -717,6 +717,26 @@ def classify_api_error( is_disconnect = any(p in error_msg for p in _SERVER_DISCONNECT_PATTERNS) if is_disconnect and not status_code: + # Reasoning-model override: a transport disconnect on a reasoning + # model is much more likely the upstream proxy idle-killing a + # long thinking stream than a true context overflow — even on + # large sessions. The default disconnect+large-session routing + # below would otherwise send the user into the compression + # branch (should_compress=True) and silently delete + # conversation history on a phantom context-length error. + # Reasoning models have multi-minute thinking phases that + # routinely exceed the cloud gateway's idle window (NVIDIA + # NIM ~120s — first-party repro at NVIDIA/NemoClaw#4846; + # OpenAI worker / Anthropic stream-idle similar). The + # per-reasoning-model stale-timeout floor in + # agent/reasoning_timeouts.py raises the stale-detector + # threshold to tolerate long thinking, so a true + # transport-layer failure here is recoverable via the retry + # path — not via context compression. Reclassify as timeout. + # (Part 1 of Fixes #52310.) + from agent.reasoning_timeouts import get_reasoning_stale_timeout_floor + if get_reasoning_stale_timeout_floor(model) is not None: + return _result(FailoverReason.timeout, retryable=True) # Absolute token/message-count thresholds are only a proxy for smaller # context windows. Large-context sessions can have hundreds of # messages while still being far below their actual token budget. diff --git a/agent/reasoning_timeouts.py b/agent/reasoning_timeouts.py new file mode 100644 index 00000000000..9e0b5cab9b9 --- /dev/null +++ b/agent/reasoning_timeouts.py @@ -0,0 +1,216 @@ +"""Per-reasoning-model stale-timeout floor for known reasoning models. + +Reasoning models (those that emit extended thinking blocks before their +first content token) routinely exceed Hermes's default chat-model +stale detectors: + +* Stream stale detector: ``HERMES_STREAM_STALE_TIMEOUT`` default 180s + ``agent/chat_completion_helpers.py:2544`` +* Non-stream stale detector: ``HERMES_API_CALL_STALE_TIMEOUT`` default 90s + ``run_agent.py:1140`` + +For NVIDIA Nemotron 3 Ultra on the hosted NIM gateway the empirical +upstream idle kill is ~120s (first-party reproduction at +NVIDIA/NemoClaw#4846 — TTFB ~31s, stream dies at 120s). The same +failure mode exists on OpenAI o1/o3, Anthropic Opus 4.x thinking, +DeepSeek R1, Qwen QwQ, xAI Grok reasoning — every cloud reasoning +model hits upstream-proxies / load-balancers with idle timeouts +shorter than the model's thinking phase. Result: the stale detector +kills the connection mid-think, surfacing as +``BrokenPipeError``/``RemoteProtocolError`` on the next read. + +This module provides a floor that the existing stale-detector scaling +blocks consult via :func:`get_reasoning_stale_timeout_floor` and +apply as ``max(default, floor)``. It is a FLOOR: + +* Never overrides explicit user config (``providers..models..stale_timeout_seconds`` + or ``request_timeout_seconds`` already wins — this code never runs + in that branch). +* Never lowers an existing threshold. +* Has zero effect on non-reasoning models — they are not in the + allowlist and the resolver returns ``None``. + +Matching uses start-anchored regex on the slug-only component of +the model name (after stripping any aggregator prefix like +``openai/``, ``x-ai/``, ``anthropic/``). The right-anchor matches +end-of-string or a ``-``/``.``/``_`` slug separator, so ``qwen3-235b`` +matches the ``qwen3`` family entry (a future model slug would be +``qwen3-235b-instruct`` and would also match) but ``some-other-qwen3`` +does NOT match ``qwen3`` (the ``-qwen3`` is not at start of slug). + +The ``o1`` case is the most delicate: a model named +``llama-4-70b-o1-preview`` is a hypothetical community derivative that +should NOT trigger the reasoning-model floor for the user (the user +chose a non-OpenAI model, not a reasoning model). The start-of-slug +anchor naturally excludes this — the matched ``o1-preview`` is at +position 11 of the slug, not at position 0. The previous substring- +with-trailing-hyphen design would have over-matched here, which is +why start-of-slug anchoring is the right shape. + +Fixes #52217. +""" + +from __future__ import annotations + +import re +from typing import Optional + + +# (slug, floor_seconds). Each slug is matched as a discrete +# word-boundary component via the wrapper regex in ``_match_any`` +# below. Order is irrelevant — the first regex match wins. +_REASONING_STALE_TIMEOUT_FLOORS: tuple[tuple[str, int], ...] = ( + # NVIDIA Nemotron — reasoning models behind hosted NIM with + # documented 60-180s upstream idle kill (NVIDIA/NemoClaw#4846: + # 120s measured). + ("nemotron-3-ultra", 600), + ("nemotron-3-super", 600), + ("nemotron-3-nano", 300), + # DeepSeek — R1 reasoning model on hosted NIM / DeepSeek direct. + ("deepseek-r1", 600), + ("deepseek-reasoner", 600), + # Qwen — QwQ reasoning + Qwen3 thinking variants. QwQ-32B + # preview is the stable slug; ``qwen3`` covers the family of + # thinking-mode Qwen3 models (qwen3-235b-a22b, qwen3-32b, etc.) + # without over-matching every Qwen3 instruct variant — the + # right-anchor requires the slug to be at the start of the + # remaining model name, so ``qwen3-235b-instruct`` (instruct is + # NOT a thinking variant) would still match. Acceptable + # trade-off: instruct variants of qwen3 get the 180s floor + # even though they don't reason. The cost is a slightly longer + # wait on a hung provider; the alternative (matching only + # ``qwen3-.*-thinking``) breaks the moment NVIDIA or Alibaba + # ships a slightly different naming shape. + ("qwq-32b", 300), + ("qwen3", 180), + # OpenAI o-series — known multi-minute TTFB. Each variant + # enumerated explicitly so bare ``o1`` doesn't over-match + # ``olmo-1`` or hypothetical future community derivatives. + ("o1", 600), + ("o1-mini", 600), + ("o1-pro", 600), + ("o1-preview", 600), + ("o3", 600), + ("o3-pro", 600), + ("o3-mini", 300), + ("o4-mini", 300), + # Anthropic Claude 4.x thinking variants. Anchored at + # ``claude-opus-4`` so non-thinking Claude 3.x or future + # non-reasoning Claude variants don't match. + ("claude-opus-4", 240), + ("claude-sonnet-4.5", 180), + ("claude-sonnet-4.6", 180), + # xAI Grok reasoning variants. Explicit reasoning-only keys + # plus one for the ``non-reasoning`` variant so users picking + # the fast variant don't get the 300s floor. Bare ``grok-3``, + # ``grok-4`` etc. don't match — only the explicit reasoning / + # non-reasoning pairs. + ("grok-4-fast-reasoning", 300), + ("grok-4.20-reasoning", 300), + ("grok-4-fast-non-reasoning", 180), +) + + +# Pre-compile each pattern. Wrapper = start-of-slug + slug + end-or- +# separator, where ``start-of-slug`` means start-of-string OR +# immediately after the last ``/`` (aggregator separator) and +# ``end-or-separator`` means end-of-string OR a ``-``/``.``/``_``. +# +# Why start-of-slug and not start-of-string: aggregator prefixes +# like ``openai/`` should not affect matching — the slug identity is +# the part after the last ``/``. Stripping the aggregator prefix in +# :func:`get_reasoning_stale_timeout_floor` before regex matching +# gives the wrapper a clean start-of-string anchor. +# +# Why end-or-separator on the right: ``openai/o3-mini`` must match +# the ``o3-mini`` slug (the right anchor is end-of-string). And +# ``openai/o3-mini-2025-01-31`` must also match ``o3-mini`` (the right +# anchor is the ``-`` separator). But ``openai/o3-mini-fork`` should +# NOT match ``o3-mini`` if we wanted to exclude forks — though the +# pattern ``o3-mini-fork`` would be matched as a derivative anyway, +# so we accept that community forks inheriting the same prefix are +# treated as reasoning models (a reasonable default — the upstream +# gateway timing is the same). +_PATTERN_CACHE: dict[str, re.Pattern[str]] = {} + + +def _get_pattern(slug: str) -> re.Pattern[str]: + compiled = _PATTERN_CACHE.get(slug) + if compiled is None: + compiled = re.compile( + r"^" + + re.escape(slug) + + r"(?:$|[\-._])" + ) + _PATTERN_CACHE[slug] = compiled + return compiled + + +def _match_any(model_lower: str) -> Optional[float]: + """Return the floor for the first matching slug, else None. + + Each table entry is matched as a start-of-slug prefix with the + slug-separator-or-end-of-string right-anchor. Table iteration + order is irrelevant: longest slug wins (so ``o3-mini`` beats + ``o3`` on a model like ``openai/o3-mini``). + """ + # Sort by slug length descending so longer / more-specific slugs + # win on shared prefixes (o3-mini beats o3). + sorted_floors = sorted( + _REASONING_STALE_TIMEOUT_FLOORS, key=lambda kv: -len(kv[0]) + ) + for slug, floor in sorted_floors: + if _get_pattern(slug).search(model_lower): + return float(floor) + return None + + +def get_reasoning_stale_timeout_floor(model: object) -> Optional[float]: + """Return the stale-timeout floor (seconds) for a known reasoning model. + + Returns ``None`` when the model is not in the allowlist or the + argument is empty / not a string. Matching uses + word-boundary-anchored regex on the lowercased model name, so + ``openai/o3-mini`` matches the ``o3-mini`` slug but + ``olmo-1`` does NOT match ``o1`` (the ``o1`` substring is not + at a word boundary inside ``olmo-1``). + + Aggregator prefixes (``openai/``, ``x-ai/``, ``anthropic/`` etc.) + are preserved through matching — the ``/`` is itself a word + boundary, so ``openai/o3-mini`` matches ``o3-mini`` because the + ``/`` before ``o3-mini`` satisfies the left-anchor alternation. + + This is a FLOOR — callers must apply it as ``max(default, floor)`` + and only when no explicit user-configured per-model + ``stale_timeout_seconds`` exists. + + >>> get_reasoning_stale_timeout_floor("nvidia/nemotron-3-ultra-550b-a55b") + 600.0 + >>> get_reasoning_stale_timeout_floor("openai/o3-mini") + 300.0 + >>> get_reasoning_stale_timeout_floor("deepseek/deepseek-r1") + 600.0 + >>> get_reasoning_stale_timeout_floor("qwen/qwen3-235b-a22b-thinking") + 180.0 + >>> get_reasoning_stale_timeout_floor("x-ai/grok-4-fast-reasoning") + 300.0 + >>> get_reasoning_stale_timeout_floor("anthropic/claude-opus-4-6") + 240.0 + >>> get_reasoning_stale_timeout_floor("gpt-4o") is None + True + >>> get_reasoning_stale_timeout_floor("olmo-1") is None + True + >>> get_reasoning_stale_timeout_floor(None) is None + True + """ + if not model or not isinstance(model, str): + return None + name = model.strip().lower() + if not name: + return None + # Strip aggregator prefix (everything before and including the + # last ``/``). The wrapper regex anchors at start-of-string, so + # the slug identity is the bare model name. + if "/" in name: + name = name.rsplit("/", 1)[1] + return _match_any(name) diff --git a/agent/thinking_timeout_guidance.py b/agent/thinking_timeout_guidance.py new file mode 100644 index 00000000000..bd8a44cb71f --- /dev/null +++ b/agent/thinking_timeout_guidance.py @@ -0,0 +1,136 @@ +"""Thinking-timeout detection and user-facing guidance for reasoning models. + +When a known reasoning model (NVIDIA Nemotron 3 Ultra, OpenAI o1/o3, +Anthropic Opus 4.x thinking, DeepSeek R1, Qwen QwQ, xAI Grok reasoning) +hits a transport-layer error before the first content token arrives, the +upstream proxy has almost certainly idle-killed a long thinking stream — +not a true context overflow or a configuration error. The user needs +distinct guidance for this case: + + "The model's thinking phase exceeded the upstream proxy's idle + timeout before the first content token arrived. This is a known + issue with reasoning models behind cloud gateways (NVIDIA NIM, + OpenAI, Anthropic, DeepSeek). Workarounds in priority order: + 1. Set `providers..models..stale_timeout_seconds: 900` + in `~/.hermes/config.yaml` to extend the per-call timeout... + 2. Lower `reasoning_budget` or set `reasoning_effort: medium`... + 3. Use a smaller / faster reasoning model..." + +The existing `_is_stream_drop` guidance at +``agent/conversation_loop.py:3464-3486`` fires for large-file-write +stream drops ("try execute_code with Python's open() for large files") +which is the WRONG advice for the thinking-timeout case. This module +provides the detection and the message as standalone helpers so the +detection logic is unit-testable without driving the full retry loop, +and the message text can be regression-tested for spelling and accuracy. + +Part 2 of Fixes #52310. +""" + +from __future__ import annotations + +from typing import Optional + + +# Substring set that identifies a transport-layer failure on the +# response stream. Same shape as the existing +# ``_SERVER_DISCONNECT_PATTERNS`` in ``agent/error_classifier.py:394`` +# but extended to also catch the OSS-level error signature +# (``broken pipe`` / ``errno 32``) that the upstream kill surfaces +# to the OpenAI SDK wrapper. +_THINKING_TIMEOUT_SUBSTRINGS: tuple[str, ...] = ( + "broken pipe", + "errno 32", + "remote protocol", + "connection reset", + "connection lost", + "peer closed", + "server disconnected", +) + + +def is_thinking_timeout(classified: object, model: str, error_msg: str) -> bool: + """Return True when a reasoning model's thinking phase hit a transport kill. + + Args: + classified: a :class:`agent.error_classifier.ClassifiedError` instance + (duck-typed here to avoid an import cycle in unit tests). + model: the model slug at failure time (e.g. + ``"nvidia/nemotron-3-ultra-550b-a55b"``). + error_msg: lowercased string representation of the underlying + exception (typically ``str(api_error).lower()``). + + Returns True when ALL conditions hold: + 1. ``classified.reason == FailoverReason.timeout`` (the classifier + override at ``agent/error_classifier.py:720-738`` ensures this + is the case for reasoning models even on large sessions). + 2. ``api_error`` has no ``.status_code`` attribute set (transport + disconnect, not an HTTP error). + 3. ``model`` is in the reasoning-model allowlist (reuses + ``agent.reasoning_timeouts.get_reasoning_stale_timeout_floor``). + 4. ``error_msg`` contains one of the transport-kill substrings. + + Non-reasoning models always return False. Non-transport errors + (billing / rate_limit / auth / context_overflow / format_error) + always return False. HTTP-status errors always return False. + """ + # Import here (not at module top) to keep this helper cheap to + # import even from callers that don't need it. ``agent.reasoning_timeouts`` + # is small and dependency-free. + from agent.reasoning_timeouts import get_reasoning_stale_timeout_floor + + # Condition 1: classifier says timeout. Use a string/value check + # rather than importing FailoverReason so this module has zero + # import cycles from the error_classifier package. + reason = getattr(classified, "reason", None) + reason_value = getattr(reason, "value", None) + if reason_value != "timeout": + return False + + # Condition 2: no HTTP status code (transport, not API error). + # Caller is expected to gate on ``getattr(api_error, "status_code", None) is None`` + # before calling this helper; the surface here is just the post-gate + # boolean so the caller can pass an already-prepped error_msg. + + # Condition 3: reasoning model allowlist. + if get_reasoning_stale_timeout_floor(model) is None: + return False + + # Condition 4: transport-kill substring in the error message. + error_msg_lower = (error_msg or "").lower() + return any(p in error_msg_lower for p in _THINKING_TIMEOUT_SUBSTRINGS) + + +def build_thinking_timeout_guidance( + provider: str, model: str, model_label: Optional[str] = None, +) -> str: + """Return the user-facing guidance string appended to ``_final_response``. + + Args: + provider: provider slug (e.g. ``"nvidia"``, ``"openai"``). + model: bare model slug the user would put in their config + (e.g. ``"nemotron-3-ultra-550b-a55b"`` if the user uses + NVIDIA direct, or the full ``"nvidia/nemotron-3-ultra-550b-a55b"`` + if they go through an aggregator). Used verbatim in the + config snippet so the user can copy-paste. + model_label: optional short label for the model name in the + prose (e.g. ``"Nemotron 3 Ultra"``). Falls back to the + slug if not provided. + """ + label = model_label or model + return ( + "\n\nThe model's thinking phase exceeded the upstream proxy's " + "idle timeout before the first content token arrived. This is a " + f"known issue with reasoning models (like {label}) behind cloud " + "gateways (NVIDIA NIM, OpenAI, Anthropic, DeepSeek). Workarounds " + "in priority order:\n" + f"1. Set `providers.{provider}.models.{model}.stale_timeout_seconds: 900` " + "in `~/.hermes/config.yaml` to extend the per-call timeout. " + "(Hermes's built-in floor is 600s for known reasoning models — " + "if you still see this after raising, the upstream cap is even " + "shorter.)\n" + "2. Lower `reasoning_budget` or set `reasoning_effort: medium` on this " + "model if the provider supports it.\n" + "3. Use a smaller / faster reasoning model if the task doesn't " + "require deep thinking." + ) diff --git a/tests/agent/test_thinking_timeout_guidance.py b/tests/agent/test_thinking_timeout_guidance.py new file mode 100644 index 00000000000..8dc28f44d33 --- /dev/null +++ b/tests/agent/test_thinking_timeout_guidance.py @@ -0,0 +1,355 @@ +"""Tests for the reasoning-model thinking-timeout detection + guidance. + +Two layers: + +1. **Classifier override (Part 1, ``agent/error_classifier.py:720-738``)**: + A transport disconnect on a reasoning model is reclassified as + ``FailoverReason.timeout`` even when the session is large — instead + of routing to the compression branch via + ``FailoverReason.context_overflow`` which would silently delete + conversation history on a phantom context-length error. + +2. **Detection + guidance (Part 2, ``agent/thinking_timeout_guidance.py``)**: + When the classifier says timeout AND the model is in the reasoning + allowlist AND the error message has a transport-kill signature, + the user gets actionable guidance (raise stale_timeout, lower + reasoning_budget, or switch models) instead of the misleading + "use execute_code with Python's open() for large files" advice + that fires for the unrelated large-file-write stream-drop case. + +Both behaviors were previously broken: the existing +``test_disconnect_large_session_context_overflow`` test in +``tests/agent/test_error_classifier.py`` confirms that non-reasoning +models still route to context_overflow on a large session, so the +reasoning-model override is strictly targeted. +""" + +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + + +# ── helpers ────────────────────────────────────────────────────────────── + + +class _TimeoutReason: + """Minimal FailoverReason stand-in for unit tests.""" + + def __init__(self, value: str = "timeout") -> None: + self.value = value + + +def _classified(reason: str = "timeout", **kwargs) -> SimpleNamespace: + """Construct a ClassifiedError stand-in with the given reason.""" + defaults = dict( + reason=_TimeoutReason(reason), + status_code=None, + retryable=True, + should_compress=False, + should_rotate_credential=False, + should_fallback=False, + ) + defaults.update(kwargs) + return SimpleNamespace(**defaults) + + +# ── Part 1: classifier override (agent/error_classifier.py:720-738) ── + + +def _make_session(disconnect_message: str, model: str, *, num_messages: int = 250): + """Construct inputs to classify_api_error for a disconnect+large-session case.""" + e = Exception(disconnect_message) + # 128k context_length; 130k approx_tokens puts us over 0.6 of context + # AND > 120k absolute threshold; 250 messages is also > 200 threshold. + # Without the reasoning-model override, this routes to context_overflow. + return e, { + "provider": "nvidia", + "model": model, + "approx_tokens": 130_000, + "context_length": 200_000, + "num_messages": num_messages, + } + + +class TestClassifierOverride: + """The reasoning-model override at error_classifier.py:720-738. + + Verifies the new behavior: a transport disconnect on a reasoning + model on a LARGE session now routes to FailoverReason.timeout + instead of context_overflow. Without this fix, the compression + branch would fire on a phantom overflow and silently delete + conversation history. + """ + + def test_reasoning_model_disconnect_on_large_session_is_timeout(self): + from agent.error_classifier import classify_api_error, FailoverReason + e, kwargs = _make_session( + "server disconnected without sending complete message", + model="nvidia/nemotron-3-ultra-550b-a55b", + ) + result = classify_api_error(e, **kwargs) + assert result.reason == FailoverReason.timeout, ( + "Reasoning-model transport disconnect on a large session " + "should route to FailoverReason.timeout (not " + "context_overflow) — the upstream proxy idle-kill is far " + "more likely than a true context-length error on a " + "thinking model." + ) + assert result.should_compress is False, ( + "Compression would silently delete conversation history on " + "a phantom overflow — must not fire for reasoning models." + ) + + @pytest.mark.parametrize("model", [ + "nvidia/nemotron-3-ultra-550b-a55b", + "openai/o3-mini", + "anthropic/claude-opus-4-6", + "deepseek/deepseek-r1", + "qwen/qwq-32b-preview", + "x-ai/grok-4-fast-reasoning", + ]) + def test_all_known_reasoning_models_override(self, model): + from agent.error_classifier import classify_api_error, FailoverReason + e, kwargs = _make_session( + "server disconnected without sending complete message", + model=model, + ) + result = classify_api_error(e, **kwargs) + assert result.reason == FailoverReason.timeout + assert result.should_compress is False + + def test_non_reasoning_model_large_session_still_routes_to_context_overflow(self): + """Regression guard: existing test_disconnect_large_session_context_overflow + behavior must be preserved for non-reasoning models. + + Without the override, this case routes to context_overflow + + should_compress=True (the existing, intentional behavior for + chat models that hit true context-length errors via proxy + disconnect). With the override, it stays that way. + """ + from agent.error_classifier import classify_api_error, FailoverReason + e, kwargs = _make_session( + "server disconnected without sending complete message", + model="gpt-4o", + ) + result = classify_api_error(e, **kwargs) + assert result.reason == FailoverReason.context_overflow + assert result.should_compress is True + + @pytest.mark.parametrize("model", [ + "olmo-1", + "gpt-4o", + "claude-3-5-sonnet-20240620", + "llama-3.3-70b-instruct", + "qwen2-72b-instruct", + "x-ai/grok-3", + ]) + def test_non_reasoning_models_all_keep_context_overflow(self, model): + from agent.error_classifier import classify_api_error, FailoverReason + e, kwargs = _make_session( + "server disconnected without sending complete message", + model=model, + ) + result = classify_api_error(e, **kwargs) + assert result.reason == FailoverReason.context_overflow + + def test_reasoning_model_small_session_still_routes_to_timeout(self): + """Sanity check: a reasoning model with a SMALL session also + routes to timeout (the original behavior, unchanged by the + override since the override's result matches the small-session + branch's result).""" + from agent.error_classifier import classify_api_error, FailoverReason + e = Exception("server disconnected") + result = classify_api_error( + e, + model="nvidia/nemotron-3-ultra-550b-a55b", + approx_tokens=5_000, + context_length=200_000, + num_messages=10, + ) + assert result.reason == FailoverReason.timeout + + def test_reasoning_model_with_status_code_does_not_match_disconnect_pattern(self): + """Status-code errors take the HTTP-status path in the + classifier, not the disconnect-with-large-session path. + The reasoning-model override is INSIDE the disconnect branch + and doesn't fire for HTTP errors.""" + from agent.error_classifier import classify_api_error, FailoverReason + e = Exception("server disconnected") + # Inject a status_code attribute to simulate an HTTP error + # whose message happens to contain "server disconnected". + e.status_code = 503 + result = classify_api_error( + e, + model="nvidia/nemotron-3-ultra-550b-a55b", + approx_tokens=130_000, + context_length=200_000, + num_messages=250, + ) + # 503 specifically routes to overloaded (per the 5xx → 503/529 + # handling in error_classifier.py). The key assertion is that + # the reasoning-model override is NOT reached — neither + # timeout nor context_overflow. + assert result.reason != FailoverReason.timeout + assert result.reason != FailoverReason.context_overflow + assert result.should_compress is False + + +# ── Part 2: detection (agent/thinking_timeout_guidance.py:is_thinking_timeout) ── + + +class TestIsThinkingTimeout: + def test_returns_true_for_reasoning_model_with_transport_signature(self): + from agent.thinking_timeout_guidance import is_thinking_timeout + classified = _classified(reason="timeout") + assert is_thinking_timeout( + classified, + "nvidia/nemotron-3-ultra-550b-a55b", + "Error communicating with OpenAI: [Errno 32] Broken pipe", + ) is True + + @pytest.mark.parametrize("model,msg", [ + ("nvidia/nemotron-3-ultra-550b-a55b", "connection reset by peer"), + ("openai/o3-mini", "remote protocol error"), + ("anthropic/claude-opus-4-6", "peer closed connection"), + ("deepseek/deepseek-r1", "connection lost"), + ("x-ai/grok-4-fast-reasoning", "server disconnected"), + ]) + def test_known_reasoning_models_match(self, model, msg): + from agent.thinking_timeout_guidance import is_thinking_timeout + classified = _classified(reason="timeout") + assert is_thinking_timeout(classified, model, msg) is True + + @pytest.mark.parametrize("model", [ + "gpt-4o", + "claude-3-5-sonnet-20240620", + "llama-3.3-70b-instruct", + "qwen2-72b-instruct", + ]) + def test_non_reasoning_models_never_match(self, model): + """Non-reasoning models must always return False even with + matching transport signature — the guidance is + reasoning-specific.""" + from agent.thinking_timeout_guidance import is_thinking_timeout + classified = _classified(reason="timeout") + assert is_thinking_timeout( + classified, model, "connection reset by peer", + ) is False + + @pytest.mark.parametrize("reason", [ + "billing", + "rate_limit", + "auth", + "context_overflow", + "format_error", + "provider_policy_blocked", + "content_policy_blocked", + "thinking_signature", + "unknown", + ]) + def test_non_timeout_reasons_never_match(self, reason): + """The detection only fires when the classifier says timeout. + Other reasons have their own distinct guidance paths.""" + from agent.thinking_timeout_guidance import is_thinking_timeout + classified = _classified(reason=reason) + assert is_thinking_timeout( + classified, + "nvidia/nemotron-3-ultra-550b-a55b", + "connection reset by peer", + ) is False + + @pytest.mark.parametrize("msg", [ + "Insufficient credits", + "Rate limit exceeded", + "Invalid API key", + "Context length exceeded", + "Tool call argument malformed", + ]) + def test_non_transport_messages_never_match(self, msg): + """The detection only fires for transport-kill signatures. + A reasoning model that returns a billing/rate-limit/auth/etc + error gets the classifier-default guidance, not this one.""" + from agent.thinking_timeout_guidance import is_thinking_timeout + classified = _classified(reason="timeout") + assert is_thinking_timeout( + classified, "nvidia/nemotron-3-ultra-550b-a55b", msg, + ) is False + + def test_empty_error_msg_returns_false(self): + from agent.thinking_timeout_guidance import is_thinking_timeout + classified = _classified(reason="timeout") + assert is_thinking_timeout( + classified, "nvidia/nemotron-3-ultra-550b-a55b", "", + ) is False + + def test_none_error_msg_returns_false(self): + from agent.thinking_timeout_guidance import is_thinking_timeout + classified = _classified(reason="timeout") + assert is_thinking_timeout( + classified, "nvidia/nemotron-3-ultra-550b-a55b", None, + ) is False + + +# ── Part 2: guidance text (agent/thinking_timeout_guidance.py:build_thinking_timeout_guidance) ── + + +class TestBuildThinkingTimeoutGuidance: + def test_guidance_mentions_config_path(self): + from agent.thinking_timeout_guidance import build_thinking_timeout_guidance + text = build_thinking_timeout_guidance( + provider="nvidia", model="nvidia/nemotron-3-ultra-550b-a55b", + ) + assert "providers.nvidia.models.nvidia/nemotron-3-ultra-550b-a55b.stale_timeout_seconds" in text + + def test_guidance_mentions_three_workarounds(self): + from agent.thinking_timeout_guidance import build_thinking_timeout_guidance + text = build_thinking_timeout_guidance(provider="nvidia", model="x") + assert "1." in text + assert "2." in text + assert "3." in text + + def test_guidance_mentions_known_providers(self): + from agent.thinking_timeout_guidance import build_thinking_timeout_guidance + text = build_thinking_timeout_guidance(provider="nvidia", model="x") + # At least one of the known cloud providers should be mentioned + # to give the user context. + assert any(p in text for p in ( + "NVIDIA NIM", "OpenAI", "Anthropic", "DeepSeek", + )) + + def test_guidance_mentions_built_in_floor(self): + """User should know that 600s is already set by default for + known reasoning models — if they hit the error after raising, + it's the upstream cap, not hermes.""" + from agent.thinking_timeout_guidance import build_thinking_timeout_guidance + text = build_thinking_timeout_guidance(provider="nvidia", model="x") + assert "600s" in text + + def test_guidance_does_not_recommend_execute_code(self): + """Critical regression guard: the new guidance must NOT + recommend `execute_code with Python's open() for large files` + — that's the misleading advice from the existing _is_stream_drop + guidance that was wrong for the thinking-timeout case.""" + from agent.thinking_timeout_guidance import build_thinking_timeout_guidance + text = build_thinking_timeout_guidance(provider="nvidia", model="x") + assert "execute_code" not in text + assert "open()" not in text + + def test_guidance_uses_label_when_provided(self): + from agent.thinking_timeout_guidance import build_thinking_timeout_guidance + text = build_thinking_timeout_guidance( + provider="nvidia", + model="nvidia/nemotron-3-ultra-550b-a55b", + model_label="Nemotron 3 Ultra", + ) + assert "Nemotron 3 Ultra" in text + + def test_guidance_falls_back_to_slug_when_no_label(self): + from agent.thinking_timeout_guidance import build_thinking_timeout_guidance + text = build_thinking_timeout_guidance( + provider="nvidia", + model="nvidia/nemotron-3-ultra-550b-a55b", + ) + assert "nvidia/nemotron-3-ultra-550b-a55b" in text