hermes-agent/tests/agent/test_thinking_timeout_guidance.py

"""Tests for the reasoning-model thinking-timeout detection + guidance.

Two layers:

1. **Classifier override (Part 1, ``agent/error_classifier.py:720-738``)**:
   A transport disconnect on a reasoning model is reclassified as
   ``FailoverReason.timeout`` even when the session is large — instead
   of routing to the compression branch via
   ``FailoverReason.context_overflow`` which would silently delete
   conversation history on a phantom context-length error.

2. **Detection + guidance (Part 2, ``agent/thinking_timeout_guidance.py``)**:
   When the classifier says timeout AND the model is in the reasoning
   allowlist AND the error message has a transport-kill signature,
   the user gets actionable guidance (raise stale_timeout, lower
   reasoning_budget, or switch models) instead of the misleading
   "use execute_code with Python's open() for large files" advice
   that fires for the unrelated large-file-write stream-drop case.

Both behaviors were previously broken: the existing
``test_disconnect_large_session_context_overflow`` test in
``tests/agent/test_error_classifier.py`` confirms that non-reasoning
models still route to context_overflow on a large session, so the
reasoning-model override is strictly targeted.
"""

from __future__ import annotations

from types import SimpleNamespace

import pytest


# ── helpers ──────────────────────────────────────────────────────────────


class _TimeoutReason:
    """Minimal FailoverReason stand-in for unit tests."""

    def __init__(self, value: str = "timeout") -> None:
        self.value = value


def _classified(reason: str = "timeout", **kwargs) -> SimpleNamespace:
    """Construct a ClassifiedError stand-in with the given reason."""
    defaults = dict(
        reason=_TimeoutReason(reason),
        status_code=None,
        retryable=True,
        should_compress=False,
        should_rotate_credential=False,
        should_fallback=False,
    )
    defaults.update(kwargs)
    return SimpleNamespace(**defaults)


# ── Part 1: classifier override (agent/error_classifier.py:720-738) ──


def _make_session(disconnect_message: str, model: str, *, num_messages: int = 250):
    """Construct inputs to classify_api_error for a disconnect+large-session case."""
    e = Exception(disconnect_message)
    # 128k context_length; 130k approx_tokens puts us over 0.6 of context
    # AND > 120k absolute threshold; 250 messages is also > 200 threshold.
    # Without the reasoning-model override, this routes to context_overflow.
    return e, {
        "provider": "nvidia",
        "model": model,
        "approx_tokens": 130_000,
        "context_length": 200_000,
        "num_messages": num_messages,
    }


class TestClassifierOverride:
    """The reasoning-model override at error_classifier.py:720-738.

    Verifies the new behavior: a transport disconnect on a reasoning
    model on a LARGE session now routes to FailoverReason.timeout
    instead of context_overflow.  Without this fix, the compression
    branch would fire on a phantom overflow and silently delete
    conversation history.
    """

    def test_reasoning_model_disconnect_on_large_session_is_timeout(self):
        from agent.error_classifier import classify_api_error, FailoverReason
        e, kwargs = _make_session(
            "server disconnected without sending complete message",
            model="nvidia/nemotron-3-ultra-550b-a55b",
        )
        result = classify_api_error(e, **kwargs)
        assert result.reason == FailoverReason.timeout, (
            "Reasoning-model transport disconnect on a large session "
            "should route to FailoverReason.timeout (not "
            "context_overflow) — the upstream proxy idle-kill is far "
            "more likely than a true context-length error on a "
            "thinking model."
        )
        assert result.should_compress is False, (
            "Compression would silently delete conversation history on "
            "a phantom overflow — must not fire for reasoning models."
        )

    @pytest.mark.parametrize("model", [
        "nvidia/nemotron-3-ultra-550b-a55b",
        "openai/o3-mini",
        "anthropic/claude-opus-4-6",
        "deepseek/deepseek-r1",
        "qwen/qwq-32b-preview",
        "x-ai/grok-4-fast-reasoning",
    ])
    def test_all_known_reasoning_models_override(self, model):
        from agent.error_classifier import classify_api_error, FailoverReason
        e, kwargs = _make_session(
            "server disconnected without sending complete message",
            model=model,
        )
        result = classify_api_error(e, **kwargs)
        assert result.reason == FailoverReason.timeout
        assert result.should_compress is False

    def test_non_reasoning_model_large_session_still_routes_to_context_overflow(self):
        """Regression guard: existing test_disconnect_large_session_context_overflow
        behavior must be preserved for non-reasoning models.

        Without the override, this case routes to context_overflow +
        should_compress=True (the existing, intentional behavior for
        chat models that hit true context-length errors via proxy
        disconnect).  With the override, it stays that way.
        """
        from agent.error_classifier import classify_api_error, FailoverReason
        e, kwargs = _make_session(
            "server disconnected without sending complete message",
            model="gpt-4o",
        )
        result = classify_api_error(e, **kwargs)
        assert result.reason == FailoverReason.context_overflow
        assert result.should_compress is True

    @pytest.mark.parametrize("model", [
        "olmo-1",
        "gpt-4o",
        "claude-3-5-sonnet-20240620",
        "llama-3.3-70b-instruct",
        "qwen2-72b-instruct",
        "x-ai/grok-3",
    ])
    def test_non_reasoning_models_all_keep_context_overflow(self, model):
        from agent.error_classifier import classify_api_error, FailoverReason
        e, kwargs = _make_session(
            "server disconnected without sending complete message",
            model=model,
        )
        result = classify_api_error(e, **kwargs)
        assert result.reason == FailoverReason.context_overflow

    def test_reasoning_model_small_session_still_routes_to_timeout(self):
        """Sanity check: a reasoning model with a SMALL session also
        routes to timeout (the original behavior, unchanged by the
        override since the override's result matches the small-session
        branch's result)."""
        from agent.error_classifier import classify_api_error, FailoverReason
        e = Exception("server disconnected")
        result = classify_api_error(
            e,
            model="nvidia/nemotron-3-ultra-550b-a55b",
            approx_tokens=5_000,
            context_length=200_000,
            num_messages=10,
        )
        assert result.reason == FailoverReason.timeout

    def test_reasoning_model_with_status_code_does_not_match_disconnect_pattern(self):
        """Status-code errors take the HTTP-status path in the
        classifier, not the disconnect-with-large-session path.
        The reasoning-model override is INSIDE the disconnect branch
        and doesn't fire for HTTP errors."""
        from agent.error_classifier import classify_api_error, FailoverReason
        e = Exception("server disconnected")
        # Inject a status_code attribute to simulate an HTTP error
        # whose message happens to contain "server disconnected".
        e.status_code = 503
        result = classify_api_error(
            e,
            model="nvidia/nemotron-3-ultra-550b-a55b",
            approx_tokens=130_000,
            context_length=200_000,
            num_messages=250,
        )
        # 503 specifically routes to overloaded (per the 5xx → 503/529
        # handling in error_classifier.py). The key assertion is that
        # the reasoning-model override is NOT reached — neither
        # timeout nor context_overflow.
        assert result.reason != FailoverReason.timeout
        assert result.reason != FailoverReason.context_overflow
        assert result.should_compress is False


# ── Part 2: detection (agent/thinking_timeout_guidance.py:is_thinking_timeout) ──


class TestIsThinkingTimeout:
    def test_returns_true_for_reasoning_model_with_transport_signature(self):
        from agent.thinking_timeout_guidance import is_thinking_timeout
        classified = _classified(reason="timeout")
        assert is_thinking_timeout(
            classified,
            "nvidia/nemotron-3-ultra-550b-a55b",
            "Error communicating with OpenAI: [Errno 32] Broken pipe",
        ) is True

    @pytest.mark.parametrize("model,msg", [
        ("nvidia/nemotron-3-ultra-550b-a55b", "connection reset by peer"),
        ("openai/o3-mini", "remote protocol error"),
        ("anthropic/claude-opus-4-6", "peer closed connection"),
        ("deepseek/deepseek-r1", "connection lost"),
        ("x-ai/grok-4-fast-reasoning", "server disconnected"),
    ])
    def test_known_reasoning_models_match(self, model, msg):
        from agent.thinking_timeout_guidance import is_thinking_timeout
        classified = _classified(reason="timeout")
        assert is_thinking_timeout(classified, model, msg) is True

    @pytest.mark.parametrize("model", [
        "gpt-4o",
        "claude-3-5-sonnet-20240620",
        "llama-3.3-70b-instruct",
        "qwen2-72b-instruct",
    ])
    def test_non_reasoning_models_never_match(self, model):
        """Non-reasoning models must always return False even with
        matching transport signature — the guidance is
        reasoning-specific."""
        from agent.thinking_timeout_guidance import is_thinking_timeout
        classified = _classified(reason="timeout")
        assert is_thinking_timeout(
            classified, model, "connection reset by peer",
        ) is False

    @pytest.mark.parametrize("reason", [
        "billing",
        "rate_limit",
        "auth",
        "context_overflow",
        "format_error",
        "provider_policy_blocked",
        "content_policy_blocked",
        "thinking_signature",
        "unknown",
    ])
    def test_non_timeout_reasons_never_match(self, reason):
        """The detection only fires when the classifier says timeout.
        Other reasons have their own distinct guidance paths."""
        from agent.thinking_timeout_guidance import is_thinking_timeout
        classified = _classified(reason=reason)
        assert is_thinking_timeout(
            classified,
            "nvidia/nemotron-3-ultra-550b-a55b",
            "connection reset by peer",
        ) is False

    @pytest.mark.parametrize("msg", [
        "Insufficient credits",
        "Rate limit exceeded",
        "Invalid API key",
        "Context length exceeded",
        "Tool call argument malformed",
    ])
    def test_non_transport_messages_never_match(self, msg):
        """The detection only fires for transport-kill signatures.
        A reasoning model that returns a billing/rate-limit/auth/etc
        error gets the classifier-default guidance, not this one."""
        from agent.thinking_timeout_guidance import is_thinking_timeout
        classified = _classified(reason="timeout")
        assert is_thinking_timeout(
            classified, "nvidia/nemotron-3-ultra-550b-a55b", msg,
        ) is False

    def test_empty_error_msg_returns_false(self):
        from agent.thinking_timeout_guidance import is_thinking_timeout
        classified = _classified(reason="timeout")
        assert is_thinking_timeout(
            classified, "nvidia/nemotron-3-ultra-550b-a55b", "",
        ) is False

    def test_none_error_msg_returns_false(self):
        from agent.thinking_timeout_guidance import is_thinking_timeout
        classified = _classified(reason="timeout")
        assert is_thinking_timeout(
            classified, "nvidia/nemotron-3-ultra-550b-a55b", None,
        ) is False


# ── Part 2: guidance text (agent/thinking_timeout_guidance.py:build_thinking_timeout_guidance) ──


class TestBuildThinkingTimeoutGuidance:
    def test_guidance_mentions_config_path(self):
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance
        text = build_thinking_timeout_guidance(
            provider="nvidia", model="nvidia/nemotron-3-ultra-550b-a55b",
        )
        assert "providers.nvidia.models.nvidia/nemotron-3-ultra-550b-a55b.stale_timeout_seconds" in text

    def test_guidance_mentions_three_workarounds(self):
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance
        text = build_thinking_timeout_guidance(provider="nvidia", model="x")
        assert "1." in text
        assert "2." in text
        assert "3." in text

    def test_guidance_mentions_known_providers(self):
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance
        text = build_thinking_timeout_guidance(provider="nvidia", model="x")
        # At least one of the known cloud providers should be mentioned
        # to give the user context.
        assert any(p in text for p in (
            "NVIDIA NIM", "OpenAI", "Anthropic", "DeepSeek",
        ))

    def test_guidance_mentions_built_in_floor(self):
        """User should know that 600s is already set by default for
        known reasoning models — if they hit the error after raising,
        it's the upstream cap, not hermes."""
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance
        text = build_thinking_timeout_guidance(provider="nvidia", model="x")
        assert "600s" in text

    def test_guidance_does_not_recommend_execute_code(self):
        """Critical regression guard: the new guidance must NOT
        recommend `execute_code with Python's open() for large files`
        — that's the misleading advice from the existing _is_stream_drop
        guidance that was wrong for the thinking-timeout case."""
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance
        text = build_thinking_timeout_guidance(provider="nvidia", model="x")
        assert "execute_code" not in text
        assert "open()" not in text

    def test_guidance_uses_label_when_provided(self):
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance
        text = build_thinking_timeout_guidance(
            provider="nvidia",
            model="nvidia/nemotron-3-ultra-550b-a55b",
            model_label="Nemotron 3 Ultra",
        )
        assert "Nemotron 3 Ultra" in text

    def test_guidance_falls_back_to_slug_when_no_label(self):
        from agent.thinking_timeout_guidance import build_thinking_timeout_guidance
        text = build_thinking_timeout_guidance(
            provider="nvidia",
            model="nvidia/nemotron-3-ultra-550b-a55b",
        )
        assert "nvidia/nemotron-3-ultra-550b-a55b" in text