Limit absolute-size overflow heuristics to context windows of at most 256000.

In classify_api_error (server disconnect without a status code) and in
_classify_400 (generic 400 body), the absolute checks on approx_tokens and
num_messages now count toward is_large only when context_length <= 256000.
The relative check (approx_tokens above 0.6 or 0.4 of context_length) still
applies to every window size.

The added tests pin the large-window case: 74320 tokens and 432 messages with
context_length=1_000_000 classify as format_error (400) and timeout
(disconnect), with should_compress False in both.

diff --git a/agent/error_classifier.py b/agent/error_classifier.py
index 86e99ec1ac..67feaa4304 100644
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@@ -520,7 +520,12 @@ def classify_api_error(
 
     is_disconnect = any(p in error_msg for p in _SERVER_DISCONNECT_PATTERNS)
     if is_disconnect and not status_code:
-        is_large = approx_tokens > context_length * 0.6 or approx_tokens > 120000 or num_messages > 200
+        # Absolute token/message-count thresholds are only a proxy for smaller
+        # context windows. Large-context sessions can have hundreds of
+        # messages while still being far below their actual token budget.
+        is_large = approx_tokens > context_length * 0.6 or (
+            context_length <= 256000 and (approx_tokens > 120000 or num_messages > 200)
+        )
         if is_large:
             return _result(
                 FailoverReason.context_overflow,
@@ -766,7 +771,12 @@ def _classify_400(
         if not err_body_msg:
             err_body_msg = str(body.get("message") or "").strip().lower()
     is_generic = len(err_body_msg) < 30 or err_body_msg in ("error", "")
-    is_large = approx_tokens > context_length * 0.4 or approx_tokens > 80000 or num_messages > 80
+    # Absolute token/message-count thresholds are only a proxy for smaller
+    # context windows. Large-context sessions can have many messages while
+    # still being far below their actual token budget.
+    is_large = approx_tokens > context_length * 0.4 or (
+        context_length <= 256000 and (approx_tokens > 80000 or num_messages > 80)
+    )
 
     if is_generic and is_large:
         return result_fn(
diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py
index 9d52c7bdf2..5a28797349 100644
--- a/tests/agent/test_error_classifier.py
+++ b/tests/agent/test_error_classifier.py
@@ -410,6 +410,24 @@ class TestClassifyApiError:
         result = classify_api_error(e, approx_tokens=1000, context_length=200000)
         assert result.reason == FailoverReason.format_error
 
+    def test_400_generic_many_messages_below_large_context_pressure_is_format_error(self):
+        """Large-context sessions should not overflow solely due to message count."""
+        e = MockAPIError(
+            "Error",
+            status_code=400,
+            body={"error": {"message": "Error"}},
+        )
+        result = classify_api_error(
+            e,
+            provider="openai-codex",
+            model="gpt-5.5",
+            approx_tokens=74320,
+            context_length=1_000_000,
+            num_messages=432,
+        )
+        assert result.reason == FailoverReason.format_error
+        assert result.should_compress is False
+
     # ── Server disconnect + large session ──
 
     def test_disconnect_large_session_context_overflow(self):
@@ -425,6 +443,20 @@ class TestClassifyApiError:
         result = classify_api_error(e, approx_tokens=5000, context_length=200000)
         assert result.reason == FailoverReason.timeout
 
+    def test_disconnect_many_messages_below_large_context_pressure_is_timeout(self):
+        """Large-context disconnects should not overflow solely due to message count."""
+        e = Exception("server disconnected without sending complete message")
+        result = classify_api_error(
+            e,
+            provider="openai-codex",
+            model="gpt-5.5",
+            approx_tokens=74320,
+            context_length=1_000_000,
+            num_messages=432,
+        )
+        assert result.reason == FailoverReason.timeout
+        assert result.should_compress is False
+
     # ── Provider-specific: Anthropic thinking signature ──
 
     def test_anthropic_thinking_signature(self):