diff --git a/agent/error_classifier.py b/agent/error_classifier.py index 14a2609d8..04875b6a5 100644 --- a/agent/error_classifier.py +++ b/agent/error_classifier.py @@ -220,12 +220,25 @@ _TRANSPORT_ERROR_TYPES = frozenset({ "ConnectionAbortedError", "BrokenPipeError", "TimeoutError", "ReadError", "ServerDisconnectedError", + # SSL/TLS transport errors — transient mid-stream handshake/record + # failures that should retry rather than surface as a stalled session. + # ssl.SSLError subclasses OSError (caught by isinstance) but we list + # the type names here so provider-wrapped SSL errors (e.g. when the + # SDK re-raises without preserving the exception chain) still classify + # as transport rather than falling through to the unknown bucket. + "SSLError", "SSLZeroReturnError", "SSLWantReadError", + "SSLWantWriteError", "SSLEOFError", "SSLSyscallError", # OpenAI SDK errors (not subclasses of Python builtins) "APIConnectionError", "APITimeoutError", }) -# Server disconnect patterns (no status code, but transport-level) +# Server disconnect patterns (no status code, but transport-level). +# These are the "ambiguous" patterns — a plain connection close could be +# transient transport hiccup OR server-side context overflow rejection +# (common when the API gateway disconnects instead of returning an HTTP +# error for oversized requests). A large session + one of these patterns +# triggers the context-overflow-with-compression recovery path. _SERVER_DISCONNECT_PATTERNS = [ "server disconnected", "peer closed connection", @@ -236,6 +249,40 @@ _SERVER_DISCONNECT_PATTERNS = [ "incomplete chunked read", ] +# SSL/TLS transient failure patterns — intentionally distinct from +# _SERVER_DISCONNECT_PATTERNS above. +# +# An SSL alert mid-stream is almost always a transport-layer hiccup +# (flaky network, mid-session TLS renegotiation failure, load balancer +# dropping the connection) — NOT a server-side context overflow signal. +# So we want the retry path but NOT the compression path; lumping these +# into _SERVER_DISCONNECT_PATTERNS would trigger unnecessary (and +# expensive) context compression on any large-session SSL hiccup. +# +# The OpenSSL library constructs error codes by prepending a format string +# to the uppercased alert reason; OpenSSL 3.x changed the separator +# (e.g. `SSLV3_ALERT_BAD_RECORD_MAC` → `SSL/TLS_ALERT_BAD_RECORD_MAC`), +# which silently stopped matching anything explicit. Matching on the +# stable substrings (`bad record mac`, `ssl alert`, `tls alert`, etc.) +# survives future OpenSSL format churn without code changes. +_SSL_TRANSIENT_PATTERNS = [ + # Space-separated (human-readable form, Python ssl module, most SDKs) + "bad record mac", + "ssl alert", + "tls alert", + "ssl handshake failure", + "tlsv1 alert", + "sslv3 alert", + # Underscore-separated (OpenSSL error code tokens, e.g. + # `ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC`, `SSLV3_ALERT_BAD_RECORD_MAC`) + "bad_record_mac", + "ssl_alert", + "tls_alert", + "tls_alert_internal_error", + # Python ssl module prefix, e.g. "[SSL: BAD_RECORD_MAC]" + "[ssl:", +] + # ── Classification pipeline ───────────────────────────────────────────── @@ -255,9 +302,10 @@ def classify_api_error( 2. HTTP status code + message-aware refinement 3. Error code classification (from body) 4. Message pattern matching (billing vs rate_limit vs context vs auth) - 5. Transport error heuristics + 5. SSL/TLS transient alert patterns → retry as timeout 6. Server disconnect + large session → context overflow - 7. Fallback: unknown (retryable with backoff) + 7. Transport error heuristics + 8. Fallback: unknown (retryable with backoff) Args: error: The exception from the API call. @@ -388,7 +436,18 @@ def classify_api_error( if classified is not None: return classified - # ── 5. Server disconnect + large session → context overflow ───── + # ── 5. SSL/TLS transient errors → retry as timeout (not compression) ── + # SSL alerts mid-stream are transport hiccups, not server-side context + # overflow signals. Classify before the disconnect check so a large + # session doesn't incorrectly trigger context compression when the real + # cause is a flaky TLS handshake. Also matches when the error is + # wrapped in a generic exception whose message string carries the SSL + # alert text but the type isn't ssl.SSLError (happens with some SDKs + # that re-raise without chaining). + if any(p in error_msg for p in _SSL_TRANSIENT_PATTERNS): + return _result(FailoverReason.timeout, retryable=True) + + # ── 6. Server disconnect + large session → context overflow ───── # Must come BEFORE generic transport error catch — a disconnect on # a large session is more likely context overflow than a transient # transport hiccup. Without this ordering, RemoteProtocolError @@ -405,12 +464,12 @@ def classify_api_error( ) return _result(FailoverReason.timeout, retryable=True) - # ── 6. Transport / timeout heuristics ─────────────────────────── + # ── 7. Transport / timeout heuristics ─────────────────────────── if error_type in _TRANSPORT_ERROR_TYPES or isinstance(error, (TimeoutError, ConnectionError, OSError)): return _result(FailoverReason.timeout, retryable=True) - # ── 7. Fallback: unknown ──────────────────────────────────────── + # ── 8. Fallback: unknown ──────────────────────────────────────── return _result(FailoverReason.unknown, retryable=True) diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py index 44e7059a9..c8faffb0c 100644 --- a/tests/agent/test_error_classifier.py +++ b/tests/agent/test_error_classifier.py @@ -949,3 +949,94 @@ class TestAdversarialEdgeCases: e = MockAPIError("server error", status_code=500, body={"message": None}) result = classify_api_error(e) assert result is not None + + +# ── Test: SSL/TLS transient errors ───────────────────────────────────── + +class TestSSLTransientPatterns: + """SSL/TLS alerts mid-stream should retry as timeout, not unknown, and + should NOT trigger context compression even on a large session. + + Motivation: OpenSSL 3.x changed TLS alert error code format + (`SSLV3_ALERT_BAD_RECORD_MAC` → `SSL/TLS_ALERT_BAD_RECORD_MAC`), + breaking string-exact matching in downstream retry logic. We match + stable substrings instead. + """ + + def test_bad_record_mac_classifies_as_timeout(self): + """OpenSSL 3.x mid-stream bad record mac alert.""" + e = Exception("[SSL: BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2580)") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + assert result.should_compress is False + + def test_openssl_3x_format_classifies_as_timeout(self): + """New format `ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC` still matches + because we key on both space- and underscore-separated forms of + the stable `bad_record_mac` token.""" + e = Exception("ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC during streaming") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + assert result.should_compress is False + + def test_tls_alert_internal_error_classifies_as_timeout(self): + e = Exception("[SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + assert result.should_compress is False + + def test_ssl_handshake_failure_classifies_as_timeout(self): + e = Exception("ssl handshake failure during mid-stream") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + + def test_ssl_prefix_classifies_as_timeout(self): + """Python's generic '[SSL: XYZ]' prefix from the ssl module.""" + e = Exception("[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + + def test_ssl_alert_on_large_session_does_not_compress(self): + """Critical: SSL alerts on big contexts must NOT trigger context + compression — compression is expensive and won't fix a transport + hiccup. This is why _SSL_TRANSIENT_PATTERNS is separate from + _SERVER_DISCONNECT_PATTERNS. + """ + e = Exception("[SSL: BAD_RECORD_MAC] sslv3 alert bad record mac") + result = classify_api_error( + e, + approx_tokens=180000, # 90% of a 200k-context window + context_length=200000, + num_messages=300, + ) + assert result.reason == FailoverReason.timeout + assert result.should_compress is False + + def test_plain_disconnect_on_large_session_still_compresses(self): + """Regression guard: the context-overflow-via-disconnect path + (non-SSL disconnects on large sessions) must still trigger + compression. Only SSL-specific disconnects skip it. + """ + e = Exception("Server disconnected without sending a response") + result = classify_api_error( + e, + approx_tokens=180000, + context_length=200000, + num_messages=300, + ) + assert result.reason == FailoverReason.context_overflow + assert result.should_compress is True + + def test_real_ssl_error_type_classifies_as_timeout(self): + """Real ssl.SSLError instance — the type name alone (not message) + should route to the transport bucket.""" + import ssl + e = ssl.SSLError("arbitrary ssl error") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True