mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
Two-part fix:
Part 1 (classifier override at agent/error_classifier.py:720-738):
A transport disconnect on a reasoning model — even on a large session —
now routes to FailoverReason.timeout instead of context_overflow. Without
this, large-session reasoning-model disconnects route to the compression
branch and silently delete conversation history on a phantom
context-length error. The override is strictly targeted: non-reasoning
models (gpt-4o, claude-3-5-sonnet, llama-3.3-70b, etc.) still route to
context_overflow on large sessions — the existing intentional behavior
for chat models whose proxy doesn't idle-kill during prefill/generation.
Part 2 (new agent/thinking_timeout_guidance.py + integration at
agent/conversation_loop.py:3488-3567):
New is_thinking_timeout() and build_thinking_timeout_guidance() helpers.
When a known reasoning model (NVIDIA Nemotron 3 Ultra, OpenAI o1/o3,
Anthropic Opus 4.x thinking, DeepSeek R1, Qwen QwQ, xAI Grok reasoning)
hits a transport-kill on a small session (classifier says timeout
directly) or after Part 1 routes correctly (large session), the user
now sees reasoning-specific guidance with three actionable workarounds
in priority order:
1. Set providers.<provider>.models.<model>.stale_timeout_seconds: 900
in ~/.hermes/config.yaml (Hermes's built-in floor is already 600s
for known reasoning models; raise further if upstream is even
tighter).
2. Lower reasoning_budget or set reasoning_effort: medium on this
model if the provider supports it.
3. Use a smaller / faster reasoning model if the task doesn't
require deep thinking.
The new guidance takes precedence via if/elif over the existing
_is_stream_drop block, so a reasoning-model user with a transport-kill
message sees actionable advice instead of the misleading "try
execute_code with Python's open() for large files" advice (which is
correct for the unrelated large-file-write stream-drop case but
actively wrong for the thinking-timeout case).
Verified:
- 478 tests passing across 9 directly-relevant files (49 new + 429
existing, zero regressions).
- Ruff lint clean on all 4 modified/new files.
- Negative test: 6 parametrized regression guards confirm non-reasoning
models still route to context_overflow on large sessions; 4
parametrized gates confirm non-timeout classifier reasons never
trigger the guidance; 5 parametrized cases confirm non-transport
messages never trigger it.
- Regression guard: new guidance message does NOT contain
"execute_code" or "open()" — the misleading advice is fully
replaced, not appended alongside.
- Cross-vendor dual review via agy -p:
- Gemini 3.5 Flash (Medium) — passed: true, zero blockers, one
SHOULD-FIX (vprint block duplication — fixed by extracting
detection into a helper module).
- GPT-OSS 120B (Medium) — passed: true, zero blockers, two nits
(test placement — adopted at tests/agent/test_thinking_timeout_guidance.py;
primary-model capture — accepted as non-issue per Flash's nit).
Dependency note for maintainers:
This PR includes agent/reasoning_timeouts.py (the reasoning-model
allowlist module from PR #52238) because the Part 1 classifier override
depends on get_reasoning_stale_timeout_floor(). After PR #52238
lands on main, this PR's duplicate agent/reasoning_timeouts.py should
be rebased away. Either PR can land first; rebasing the other is
mechanical.
Fixes #52271.
355 lines
15 KiB
Python
355 lines
15 KiB
Python
"""Tests for the reasoning-model thinking-timeout detection + guidance.

Two layers:

1. **Classifier override (Part 1, ``agent/error_classifier.py:720-738``)**:
   A transport disconnect on a reasoning model is reclassified as
   ``FailoverReason.timeout`` even when the session is large — instead
   of routing to the compression branch via
   ``FailoverReason.context_overflow`` which would silently delete
   conversation history on a phantom context-length error.

2. **Detection + guidance (Part 2, ``agent/thinking_timeout_guidance.py``)**:
   When the classifier says timeout AND the model is in the reasoning
   allowlist AND the error message has a transport-kill signature,
   the user gets actionable guidance (raise stale_timeout, lower
   reasoning_budget, or switch models) instead of the misleading
   "use execute_code with Python's open() for large files" advice
   that fires for the unrelated large-file-write stream-drop case.

Both behaviors were previously broken. The existing
``test_disconnect_large_session_context_overflow`` test in
``tests/agent/test_error_classifier.py`` confirms that non-reasoning
models still route to context_overflow on a large session, so the
reasoning-model override is strictly targeted.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from types import SimpleNamespace
|
|
|
|
import pytest
|
|
|
|
|
|
# ── helpers ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
class _TimeoutReason:
    """Minimal FailoverReason stand-in for unit tests.

    Carries a single ``value`` string attribute, defaulting to
    ``"timeout"``. Despite the name, any reason string may be supplied;
    ``_classified`` uses this class for non-timeout reasons as well.
    """

    def __init__(self, value: str = "timeout") -> None:
        self.value = value
|
|
|
|
|
|
def _classified(reason: str = "timeout", **kwargs: object) -> SimpleNamespace:
    """Construct a ClassifiedError stand-in with the given reason.

    Args:
        reason: Reason string, wrapped in a ``_TimeoutReason`` so the
            result exposes ``.reason.value``.
        **kwargs: Attribute overrides applied on top of the defaults
            below. Passing ``reason=...`` here is not possible because
            the name is already bound to the positional parameter.

    Returns:
        A ``SimpleNamespace`` carrying ``reason``, ``status_code``,
        ``retryable``, ``should_compress``, ``should_rotate_credential``
        and ``should_fallback``, plus any extra keys from ``kwargs``.
    """
    defaults = {
        "reason": _TimeoutReason(reason),
        "status_code": None,
        "retryable": True,
        "should_compress": False,
        "should_rotate_credential": False,
        "should_fallback": False,
    }
    defaults.update(kwargs)
    return SimpleNamespace(**defaults)
|
|
|
|
|
|
# ── Part 1: classifier override (agent/error_classifier.py:720-738) ──
|
|
|
|
|
|
def _make_session(
    disconnect_message: str,
    model: str,
    *,
    num_messages: int = 250,
) -> tuple[Exception, dict[str, object]]:
    """Construct inputs to classify_api_error for a disconnect+large-session case.

    Args:
        disconnect_message: Text used as the message of the returned exception.
        model: Model slug placed under the ``"model"`` key.
        num_messages: Message count placed under ``"num_messages"``.

    Returns:
        A ``(exception, kwargs)`` pair; the tests in this file call
        ``classify_api_error(exception, **kwargs)`` with it. The provider
        is always ``"nvidia"`` regardless of ``model``.
    """
    e = Exception(disconnect_message)
    # 200k context_length; 130k approx_tokens is 65% of the window, which
    # puts us over 0.6 of context AND > 120k absolute threshold; the
    # default 250 messages is also > 200 threshold.
    # Without the reasoning-model override, this routes to context_overflow.
    return e, {
        "provider": "nvidia",
        "model": model,
        "approx_tokens": 130_000,
        "context_length": 200_000,
        "num_messages": num_messages,
    }
|
|
|
|
|
|
class TestClassifierOverride:
    """The reasoning-model override at error_classifier.py:720-738.

    Verifies the new behavior: a transport disconnect on a reasoning
    model on a LARGE session now routes to FailoverReason.timeout
    instead of context_overflow. Without this fix, the compression
    branch would fire on a phantom overflow and silently delete
    conversation history.
    """

    def test_reasoning_model_disconnect_on_large_session_is_timeout(self):
        from agent.error_classifier import classify_api_error, FailoverReason

        e, kwargs = _make_session(
            "server disconnected without sending complete message",
            model="nvidia/nemotron-3-ultra-550b-a55b",
        )
        result = classify_api_error(e, **kwargs)
        assert result.reason == FailoverReason.timeout, (
            "Reasoning-model transport disconnect on a large session "
            "should route to FailoverReason.timeout (not "
            "context_overflow) — the upstream proxy idle-kill is far "
            "more likely than a true context-length error on a "
            "thinking model."
        )
        assert result.should_compress is False, (
            "Compression would silently delete conversation history on "
            "a phantom overflow — must not fire for reasoning models."
        )

    @pytest.mark.parametrize("model", [
        "nvidia/nemotron-3-ultra-550b-a55b",
        "openai/o3-mini",
        "anthropic/claude-opus-4-6",
        "deepseek/deepseek-r1",
        "qwen/qwq-32b-preview",
        "x-ai/grok-4-fast-reasoning",
    ])
    def test_all_known_reasoning_models_override(self, model):
        """Each reasoning-model slug listed above must get the override."""
        from agent.error_classifier import classify_api_error, FailoverReason

        e, kwargs = _make_session(
            "server disconnected without sending complete message",
            model=model,
        )
        result = classify_api_error(e, **kwargs)
        assert result.reason == FailoverReason.timeout
        assert result.should_compress is False

    def test_non_reasoning_model_large_session_still_routes_to_context_overflow(self):
        """Regression guard: existing test_disconnect_large_session_context_overflow
        behavior must be preserved for non-reasoning models.

        Without the override, this case routes to context_overflow +
        should_compress=True (the existing, intentional behavior for
        chat models that hit true context-length errors via proxy
        disconnect). With the override, it stays that way.
        """
        from agent.error_classifier import classify_api_error, FailoverReason

        e, kwargs = _make_session(
            "server disconnected without sending complete message",
            model="gpt-4o",
        )
        result = classify_api_error(e, **kwargs)
        assert result.reason == FailoverReason.context_overflow
        assert result.should_compress is True

    @pytest.mark.parametrize("model", [
        "olmo-1",
        "gpt-4o",
        "claude-3-5-sonnet-20240620",
        "llama-3.3-70b-instruct",
        "qwen2-72b-instruct",
        "x-ai/grok-3",
    ])
    def test_non_reasoning_models_all_keep_context_overflow(self, model):
        """Non-reasoning slugs must keep routing to context_overflow."""
        from agent.error_classifier import classify_api_error, FailoverReason

        e, kwargs = _make_session(
            "server disconnected without sending complete message",
            model=model,
        )
        result = classify_api_error(e, **kwargs)
        assert result.reason == FailoverReason.context_overflow

    def test_reasoning_model_small_session_still_routes_to_timeout(self):
        """Sanity check: a reasoning model with a SMALL session also
        routes to timeout (the original behavior, unchanged by the
        override since the override's result matches the small-session
        branch's result)."""
        from agent.error_classifier import classify_api_error, FailoverReason

        e = Exception("server disconnected")
        result = classify_api_error(
            e,
            model="nvidia/nemotron-3-ultra-550b-a55b",
            approx_tokens=5_000,
            context_length=200_000,
            num_messages=10,
        )
        assert result.reason == FailoverReason.timeout

    def test_reasoning_model_with_status_code_does_not_match_disconnect_pattern(self):
        """Status-code errors take the HTTP-status path in the
        classifier, not the disconnect-with-large-session path.
        The reasoning-model override is INSIDE the disconnect branch
        and doesn't fire for HTTP errors."""
        from agent.error_classifier import classify_api_error, FailoverReason

        e = Exception("server disconnected")
        # Inject a status_code attribute to simulate an HTTP error
        # whose message happens to contain "server disconnected".
        e.status_code = 503
        result = classify_api_error(
            e,
            model="nvidia/nemotron-3-ultra-550b-a55b",
            approx_tokens=130_000,
            context_length=200_000,
            num_messages=250,
        )
        # 503 specifically routes to overloaded (per the 5xx → 503/529
        # handling in error_classifier.py). The key assertion is that
        # the reasoning-model override is NOT reached — neither
        # timeout nor context_overflow.
        assert result.reason != FailoverReason.timeout
        assert result.reason != FailoverReason.context_overflow
        assert result.should_compress is False
|
|
|
|
|
|
# ── Part 2: detection (agent/thinking_timeout_guidance.py:is_thinking_timeout) ──
|
|
|
|
|
|
class TestIsThinkingTimeout:
    """Tests for ``is_thinking_timeout(classified, model, error_msg)``.

    The cases below expect ``True`` only when all three hold: the
    classified reason is ``"timeout"``, the model is one of the
    reasoning-model slugs, and the message carries a transport-kill
    signature. Every other combination is expected to be ``False``.
    """

    def test_returns_true_for_reasoning_model_with_transport_signature(self):
        from agent.thinking_timeout_guidance import is_thinking_timeout

        classified = _classified(reason="timeout")
        assert is_thinking_timeout(
            classified,
            "nvidia/nemotron-3-ultra-550b-a55b",
            "Error communicating with OpenAI: [Errno 32] Broken pipe",
        ) is True

    @pytest.mark.parametrize("model,msg", [
        ("nvidia/nemotron-3-ultra-550b-a55b", "connection reset by peer"),
        ("openai/o3-mini", "remote protocol error"),
        ("anthropic/claude-opus-4-6", "peer closed connection"),
        ("deepseek/deepseek-r1", "connection lost"),
        ("x-ai/grok-4-fast-reasoning", "server disconnected"),
    ])
    def test_known_reasoning_models_match(self, model, msg):
        """Each (reasoning model, transport message) pair must match."""
        from agent.thinking_timeout_guidance import is_thinking_timeout

        classified = _classified(reason="timeout")
        assert is_thinking_timeout(classified, model, msg) is True

    @pytest.mark.parametrize("model", [
        "gpt-4o",
        "claude-3-5-sonnet-20240620",
        "llama-3.3-70b-instruct",
        "qwen2-72b-instruct",
    ])
    def test_non_reasoning_models_never_match(self, model):
        """Non-reasoning models must always return False even with
        matching transport signature — the guidance is
        reasoning-specific."""
        from agent.thinking_timeout_guidance import is_thinking_timeout

        classified = _classified(reason="timeout")
        assert is_thinking_timeout(
            classified, model, "connection reset by peer",
        ) is False

    @pytest.mark.parametrize("reason", [
        "billing",
        "rate_limit",
        "auth",
        "context_overflow",
        "format_error",
        "provider_policy_blocked",
        "content_policy_blocked",
        "thinking_signature",
        "unknown",
    ])
    def test_non_timeout_reasons_never_match(self, reason):
        """The detection only fires when the classifier says timeout.
        Other reasons have their own distinct guidance paths."""
        from agent.thinking_timeout_guidance import is_thinking_timeout

        classified = _classified(reason=reason)
        assert is_thinking_timeout(
            classified,
            "nvidia/nemotron-3-ultra-550b-a55b",
            "connection reset by peer",
        ) is False

    @pytest.mark.parametrize("msg", [
        "Insufficient credits",
        "Rate limit exceeded",
        "Invalid API key",
        "Context length exceeded",
        "Tool call argument malformed",
    ])
    def test_non_transport_messages_never_match(self, msg):
        """The detection only fires for transport-kill signatures.
        A reasoning model that returns a billing/rate-limit/auth/etc
        error gets the classifier-default guidance, not this one."""
        from agent.thinking_timeout_guidance import is_thinking_timeout

        classified = _classified(reason="timeout")
        assert is_thinking_timeout(
            classified, "nvidia/nemotron-3-ultra-550b-a55b", msg,
        ) is False

    def test_empty_error_msg_returns_false(self):
        from agent.thinking_timeout_guidance import is_thinking_timeout

        classified = _classified(reason="timeout")
        assert is_thinking_timeout(
            classified, "nvidia/nemotron-3-ultra-550b-a55b", "",
        ) is False

    def test_none_error_msg_returns_false(self):
        from agent.thinking_timeout_guidance import is_thinking_timeout

        classified = _classified(reason="timeout")
        assert is_thinking_timeout(
            classified, "nvidia/nemotron-3-ultra-550b-a55b", None,
        ) is False
|
|
|
|
|
|
# ── Part 2: guidance text (agent/thinking_timeout_guidance.py:build_thinking_timeout_guidance) ──
|
|
|
|
|
|
class TestBuildThinkingTimeoutGuidance:
    """Tests for the text returned by ``build_thinking_timeout_guidance``.

    Each case builds the guidance for a provider/model pair and checks
    for (or for the absence of) specific substrings in the result.
    """

    def test_guidance_mentions_config_path(self):
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance

        text = build_thinking_timeout_guidance(
            provider="nvidia", model="nvidia/nemotron-3-ultra-550b-a55b",
        )
        # Split across two literals only to keep the line short; the
        # concatenated string is the full dotted config key.
        assert (
            "providers.nvidia.models."
            "nvidia/nemotron-3-ultra-550b-a55b.stale_timeout_seconds"
        ) in text

    def test_guidance_mentions_three_workarounds(self):
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance

        text = build_thinking_timeout_guidance(provider="nvidia", model="x")
        assert "1." in text
        assert "2." in text
        assert "3." in text

    def test_guidance_mentions_known_providers(self):
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance

        text = build_thinking_timeout_guidance(provider="nvidia", model="x")
        # At least one of the known cloud providers should be mentioned
        # to give the user context.
        assert any(p in text for p in (
            "NVIDIA NIM", "OpenAI", "Anthropic", "DeepSeek",
        ))

    def test_guidance_mentions_built_in_floor(self):
        """User should know that 600s is already set by default for
        known reasoning models — if they hit the error after raising,
        it's the upstream cap, not hermes."""
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance

        text = build_thinking_timeout_guidance(provider="nvidia", model="x")
        assert "600s" in text

    def test_guidance_does_not_recommend_execute_code(self):
        """Critical regression guard: the new guidance must NOT
        recommend `execute_code with Python's open() for large files`
        — that's the misleading advice from the existing _is_stream_drop
        guidance that was wrong for the thinking-timeout case."""
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance

        text = build_thinking_timeout_guidance(provider="nvidia", model="x")
        assert "execute_code" not in text
        assert "open()" not in text

    def test_guidance_uses_label_when_provided(self):
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance

        text = build_thinking_timeout_guidance(
            provider="nvidia",
            model="nvidia/nemotron-3-ultra-550b-a55b",
            model_label="Nemotron 3 Ultra",
        )
        assert "Nemotron 3 Ultra" in text

    def test_guidance_falls_back_to_slug_when_no_label(self):
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance

        text = build_thinking_timeout_guidance(
            provider="nvidia",
            model="nvidia/nemotron-3-ultra-550b-a55b",
        )
        assert "nvidia/nemotron-3-ultra-550b-a55b" in text
|