From b40b6ec720df05786b865579d3a094efe9e481a2 Mon Sep 17 00:00:00 2001 From: Teknium Date: Wed, 22 Apr 2026 17:07:12 -0700 Subject: [PATCH] fix(error_classifier): retry mid-stream SSL/TLS alert errors as transport MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mid-stream SSL alerts (bad_record_mac, tls_alert_internal_error, handshake failures) previously fell through the classifier pipeline to the 'unknown' bucket because: - ssl.SSLError type names weren't in _TRANSPORT_ERROR_TYPES (the isinstance(OSError) catch picks up some but not all SDK-wrapped forms) - the message-pattern list had no SSL alert substrings The 'unknown' bucket is still retryable, but: (a) logs tell the user 'unknown' instead of identifying the cause, (b) it bypasses the transport-specific backoff/fallback logic, and (c) if the SSL error happens on a large session with a generic 'connection closed' wrapper, the existing disconnect-on-large-session heuristic would incorrectly trigger context compression — expensive, and never fixes a transport hiccup. Changes: - Add ssl.SSLError and its subclass type names to _TRANSPORT_ERROR_TYPES - New _SSL_TRANSIENT_PATTERNS list (separate from _SERVER_DISCONNECT_PATTERNS so SSL alerts route to timeout, not context_overflow+compress) - New step 5 in the classifier pipeline: SSL pattern check runs BEFORE the disconnect check to pre-empt the large-session-compress path Patterns cover both space-separated ('ssl alert', 'bad record mac') and underscore-separated ('ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC') forms. This is load-bearing because OpenSSL 3.x changed the error-code separator from underscore to slash (e.g. SSLV3_ALERT_BAD_RECORD_MAC → SSL/TLS_ALERT_BAD_RECORD_MAC) and will likely churn again — matching on stable alert reason substrings survives future format changes. Tests (8 new): - BAD_RECORD_MAC in Python ssl.c format - OpenSSL 3.x underscore format - TLSV1_ALERT_INTERNAL_ERROR - ssl handshake failure - [SSL: ...] prefix fallback - Real ssl.SSLError instance - REGRESSION GUARD: SSL on large session does NOT compress - REGRESSION GUARD: plain disconnect on large session STILL compresses --- agent/error_classifier.py | 71 ++++++++++++++++++++-- tests/agent/test_error_classifier.py | 91 ++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+), 6 deletions(-) diff --git a/agent/error_classifier.py b/agent/error_classifier.py index 14a2609d8..04875b6a5 100644 --- a/agent/error_classifier.py +++ b/agent/error_classifier.py @@ -220,12 +220,25 @@ _TRANSPORT_ERROR_TYPES = frozenset({ "ConnectionAbortedError", "BrokenPipeError", "TimeoutError", "ReadError", "ServerDisconnectedError", + # SSL/TLS transport errors — transient mid-stream handshake/record + # failures that should retry rather than surface as a stalled session. + # ssl.SSLError subclasses OSError (caught by isinstance) but we list + # the type names here so provider-wrapped SSL errors (e.g. when the + # SDK re-raises without preserving the exception chain) still classify + # as transport rather than falling through to the unknown bucket. + "SSLError", "SSLZeroReturnError", "SSLWantReadError", + "SSLWantWriteError", "SSLEOFError", "SSLSyscallError", # OpenAI SDK errors (not subclasses of Python builtins) "APIConnectionError", "APITimeoutError", }) -# Server disconnect patterns (no status code, but transport-level) +# Server disconnect patterns (no status code, but transport-level). +# These are the "ambiguous" patterns — a plain connection close could be +# transient transport hiccup OR server-side context overflow rejection +# (common when the API gateway disconnects instead of returning an HTTP +# error for oversized requests). A large session + one of these patterns +# triggers the context-overflow-with-compression recovery path. _SERVER_DISCONNECT_PATTERNS = [ "server disconnected", "peer closed connection", @@ -236,6 +249,40 @@ _SERVER_DISCONNECT_PATTERNS = [ "incomplete chunked read", ] +# SSL/TLS transient failure patterns — intentionally distinct from +# _SERVER_DISCONNECT_PATTERNS above. +# +# An SSL alert mid-stream is almost always a transport-layer hiccup +# (flaky network, mid-session TLS renegotiation failure, load balancer +# dropping the connection) — NOT a server-side context overflow signal. +# So we want the retry path but NOT the compression path; lumping these +# into _SERVER_DISCONNECT_PATTERNS would trigger unnecessary (and +# expensive) context compression on any large-session SSL hiccup. +# +# The OpenSSL library constructs error codes by prepending a format string +# to the uppercased alert reason; OpenSSL 3.x changed the separator +# (e.g. `SSLV3_ALERT_BAD_RECORD_MAC` → `SSL/TLS_ALERT_BAD_RECORD_MAC`), +# which silently stopped matching anything explicit. Matching on the +# stable substrings (`bad record mac`, `ssl alert`, `tls alert`, etc.) +# survives future OpenSSL format churn without code changes. +_SSL_TRANSIENT_PATTERNS = [ + # Space-separated (human-readable form, Python ssl module, most SDKs) + "bad record mac", + "ssl alert", + "tls alert", + "ssl handshake failure", + "tlsv1 alert", + "sslv3 alert", + # Underscore-separated (OpenSSL error code tokens, e.g. + # `ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC`, `SSLV3_ALERT_BAD_RECORD_MAC`) + "bad_record_mac", + "ssl_alert", + "tls_alert", + "tls_alert_internal_error", + # Python ssl module prefix, e.g. "[SSL: BAD_RECORD_MAC]" + "[ssl:", +] + # ── Classification pipeline ───────────────────────────────────────────── @@ -255,9 +302,10 @@ def classify_api_error( 2. HTTP status code + message-aware refinement 3. Error code classification (from body) 4. Message pattern matching (billing vs rate_limit vs context vs auth) - 5. Transport error heuristics + 5. SSL/TLS transient alert patterns → retry as timeout 6. Server disconnect + large session → context overflow - 7. Fallback: unknown (retryable with backoff) + 7. Transport error heuristics + 8. Fallback: unknown (retryable with backoff) Args: error: The exception from the API call. @@ -388,7 +436,18 @@ def classify_api_error( if classified is not None: return classified - # ── 5. Server disconnect + large session → context overflow ───── + # ── 5. SSL/TLS transient errors → retry as timeout (not compression) ── + # SSL alerts mid-stream are transport hiccups, not server-side context + # overflow signals. Classify before the disconnect check so a large + # session doesn't incorrectly trigger context compression when the real + # cause is a flaky TLS handshake. Also matches when the error is + # wrapped in a generic exception whose message string carries the SSL + # alert text but the type isn't ssl.SSLError (happens with some SDKs + # that re-raise without chaining). + if any(p in error_msg for p in _SSL_TRANSIENT_PATTERNS): + return _result(FailoverReason.timeout, retryable=True) + + # ── 6. Server disconnect + large session → context overflow ───── # Must come BEFORE generic transport error catch — a disconnect on # a large session is more likely context overflow than a transient # transport hiccup. Without this ordering, RemoteProtocolError @@ -405,12 +464,12 @@ def classify_api_error( ) return _result(FailoverReason.timeout, retryable=True) - # ── 6. Transport / timeout heuristics ─────────────────────────── + # ── 7. Transport / timeout heuristics ─────────────────────────── if error_type in _TRANSPORT_ERROR_TYPES or isinstance(error, (TimeoutError, ConnectionError, OSError)): return _result(FailoverReason.timeout, retryable=True) - # ── 7. Fallback: unknown ──────────────────────────────────────── + # ── 8. Fallback: unknown ──────────────────────────────────────── return _result(FailoverReason.unknown, retryable=True) diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py index 44e7059a9..c8faffb0c 100644 --- a/tests/agent/test_error_classifier.py +++ b/tests/agent/test_error_classifier.py @@ -949,3 +949,94 @@ class TestAdversarialEdgeCases: e = MockAPIError("server error", status_code=500, body={"message": None}) result = classify_api_error(e) assert result is not None + + +# ── Test: SSL/TLS transient errors ───────────────────────────────────── + +class TestSSLTransientPatterns: + """SSL/TLS alerts mid-stream should retry as timeout, not unknown, and + should NOT trigger context compression even on a large session. + + Motivation: OpenSSL 3.x changed TLS alert error code format + (`SSLV3_ALERT_BAD_RECORD_MAC` → `SSL/TLS_ALERT_BAD_RECORD_MAC`), + breaking string-exact matching in downstream retry logic. We match + stable substrings instead. + """ + + def test_bad_record_mac_classifies_as_timeout(self): + """OpenSSL 3.x mid-stream bad record mac alert.""" + e = Exception("[SSL: BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2580)") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + assert result.should_compress is False + + def test_openssl_3x_format_classifies_as_timeout(self): + """New format `ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC` still matches + because we key on both space- and underscore-separated forms of + the stable `bad_record_mac` token.""" + e = Exception("ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC during streaming") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + assert result.should_compress is False + + def test_tls_alert_internal_error_classifies_as_timeout(self): + e = Exception("[SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + assert result.should_compress is False + + def test_ssl_handshake_failure_classifies_as_timeout(self): + e = Exception("ssl handshake failure during mid-stream") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + + def test_ssl_prefix_classifies_as_timeout(self): + """Python's generic '[SSL: XYZ]' prefix from the ssl module.""" + e = Exception("[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + + def test_ssl_alert_on_large_session_does_not_compress(self): + """Critical: SSL alerts on big contexts must NOT trigger context + compression — compression is expensive and won't fix a transport + hiccup. This is why _SSL_TRANSIENT_PATTERNS is separate from + _SERVER_DISCONNECT_PATTERNS. + """ + e = Exception("[SSL: BAD_RECORD_MAC] sslv3 alert bad record mac") + result = classify_api_error( + e, + approx_tokens=180000, # 90% of a 200k-context window + context_length=200000, + num_messages=300, + ) + assert result.reason == FailoverReason.timeout + assert result.should_compress is False + + def test_plain_disconnect_on_large_session_still_compresses(self): + """Regression guard: the context-overflow-via-disconnect path + (non-SSL disconnects on large sessions) must still trigger + compression. Only SSL-specific disconnects skip it. + """ + e = Exception("Server disconnected without sending a response") + result = classify_api_error( + e, + approx_tokens=180000, + context_length=200000, + num_messages=300, + ) + assert result.reason == FailoverReason.context_overflow + assert result.should_compress is True + + def test_real_ssl_error_type_classifies_as_timeout(self): + """Real ssl.SSLError instance — the type name alone (not message) + should route to the transport bucket.""" + import ssl + e = ssl.SSLError("arbitrary ssl error") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True