mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(error_classifier): retry mid-stream SSL/TLS alert errors as transport
Mid-stream SSL alerts (bad_record_mac, tls_alert_internal_error, handshake
failures) previously fell through the classifier pipeline to the 'unknown'
bucket because:
- ssl.SSLError type names weren't in _TRANSPORT_ERROR_TYPES (the
isinstance(OSError) catch picks up some but not all SDK-wrapped forms)
- the message-pattern list had no SSL alert substrings
The 'unknown' bucket is still retryable, but: (a) logs tell the user
'unknown' instead of identifying the cause, (b) it bypasses the
transport-specific backoff/fallback logic, and (c) if the SSL error
happens on a large session with a generic 'connection closed' wrapper,
the existing disconnect-on-large-session heuristic would incorrectly
trigger context compression — expensive, and never fixes a transport
hiccup.
Changes:
- Add ssl.SSLError and its subclass type names to _TRANSPORT_ERROR_TYPES
- New _SSL_TRANSIENT_PATTERNS list (separate from _SERVER_DISCONNECT_PATTERNS
so SSL alerts route to timeout, not context_overflow+compress)
- New step 5 in the classifier pipeline: SSL pattern check runs BEFORE
the disconnect check to pre-empt the large-session-compress path
Patterns cover both space-separated ('ssl alert', 'bad record mac')
and underscore-separated ('ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC')
forms. This is load-bearing because OpenSSL 3.x changed the error-code
separator from underscore to slash (e.g. SSLV3_ALERT_BAD_RECORD_MAC →
SSL/TLS_ALERT_BAD_RECORD_MAC) and will likely churn again — matching on
stable alert reason substrings survives future format changes.
Tests (8 new):
- BAD_RECORD_MAC in Python ssl.c format
- OpenSSL 3.x underscore format
- TLSV1_ALERT_INTERNAL_ERROR
- ssl handshake failure
- [SSL: ...] prefix fallback
- Real ssl.SSLError instance
- REGRESSION GUARD: SSL on large session does NOT compress
- REGRESSION GUARD: plain disconnect on large session STILL compresses
This commit is contained in:
parent
402d048eb6
commit
b40b6ec720
2 changed files with 156 additions and 6 deletions
|
|
@ -220,12 +220,25 @@ _TRANSPORT_ERROR_TYPES = frozenset({
|
||||||
"ConnectionAbortedError", "BrokenPipeError",
|
"ConnectionAbortedError", "BrokenPipeError",
|
||||||
"TimeoutError", "ReadError",
|
"TimeoutError", "ReadError",
|
||||||
"ServerDisconnectedError",
|
"ServerDisconnectedError",
|
||||||
|
# SSL/TLS transport errors — transient mid-stream handshake/record
|
||||||
|
# failures that should retry rather than surface as a stalled session.
|
||||||
|
# ssl.SSLError subclasses OSError (caught by isinstance) but we list
|
||||||
|
# the type names here so provider-wrapped SSL errors (e.g. when the
|
||||||
|
# SDK re-raises without preserving the exception chain) still classify
|
||||||
|
# as transport rather than falling through to the unknown bucket.
|
||||||
|
"SSLError", "SSLZeroReturnError", "SSLWantReadError",
|
||||||
|
"SSLWantWriteError", "SSLEOFError", "SSLSyscallError",
|
||||||
# OpenAI SDK errors (not subclasses of Python builtins)
|
# OpenAI SDK errors (not subclasses of Python builtins)
|
||||||
"APIConnectionError",
|
"APIConnectionError",
|
||||||
"APITimeoutError",
|
"APITimeoutError",
|
||||||
})
|
})
|
||||||
|
|
||||||
# Server disconnect patterns (no status code, but transport-level)
|
# Server disconnect patterns (no status code, but transport-level).
|
||||||
|
# These are the "ambiguous" patterns — a plain connection close could be
|
||||||
|
# transient transport hiccup OR server-side context overflow rejection
|
||||||
|
# (common when the API gateway disconnects instead of returning an HTTP
|
||||||
|
# error for oversized requests). A large session + one of these patterns
|
||||||
|
# triggers the context-overflow-with-compression recovery path.
|
||||||
_SERVER_DISCONNECT_PATTERNS = [
|
_SERVER_DISCONNECT_PATTERNS = [
|
||||||
"server disconnected",
|
"server disconnected",
|
||||||
"peer closed connection",
|
"peer closed connection",
|
||||||
|
|
@ -236,6 +249,40 @@ _SERVER_DISCONNECT_PATTERNS = [
|
||||||
"incomplete chunked read",
|
"incomplete chunked read",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# SSL/TLS transient failure patterns — intentionally distinct from
|
||||||
|
# _SERVER_DISCONNECT_PATTERNS above.
|
||||||
|
#
|
||||||
|
# An SSL alert mid-stream is almost always a transport-layer hiccup
|
||||||
|
# (flaky network, mid-session TLS renegotiation failure, load balancer
|
||||||
|
# dropping the connection) — NOT a server-side context overflow signal.
|
||||||
|
# So we want the retry path but NOT the compression path; lumping these
|
||||||
|
# into _SERVER_DISCONNECT_PATTERNS would trigger unnecessary (and
|
||||||
|
# expensive) context compression on any large-session SSL hiccup.
|
||||||
|
#
|
||||||
|
# The OpenSSL library constructs error codes by prepending a format string
|
||||||
|
# to the uppercased alert reason; OpenSSL 3.x changed the separator
|
||||||
|
# (e.g. `SSLV3_ALERT_BAD_RECORD_MAC` → `SSL/TLS_ALERT_BAD_RECORD_MAC`),
|
||||||
|
# which silently stopped matching anything explicit. Matching on the
|
||||||
|
# stable substrings (`bad record mac`, `ssl alert`, `tls alert`, etc.)
|
||||||
|
# survives future OpenSSL format churn without code changes.
|
||||||
|
_SSL_TRANSIENT_PATTERNS = [
|
||||||
|
# Space-separated (human-readable form, Python ssl module, most SDKs)
|
||||||
|
"bad record mac",
|
||||||
|
"ssl alert",
|
||||||
|
"tls alert",
|
||||||
|
"ssl handshake failure",
|
||||||
|
"tlsv1 alert",
|
||||||
|
"sslv3 alert",
|
||||||
|
# Underscore-separated (OpenSSL error code tokens, e.g.
|
||||||
|
# `ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC`, `SSLV3_ALERT_BAD_RECORD_MAC`)
|
||||||
|
"bad_record_mac",
|
||||||
|
"ssl_alert",
|
||||||
|
"tls_alert",
|
||||||
|
"tls_alert_internal_error",
|
||||||
|
# Python ssl module prefix, e.g. "[SSL: BAD_RECORD_MAC]"
|
||||||
|
"[ssl:",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
# ── Classification pipeline ─────────────────────────────────────────────
|
# ── Classification pipeline ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
@ -255,9 +302,10 @@ def classify_api_error(
|
||||||
2. HTTP status code + message-aware refinement
|
2. HTTP status code + message-aware refinement
|
||||||
3. Error code classification (from body)
|
3. Error code classification (from body)
|
||||||
4. Message pattern matching (billing vs rate_limit vs context vs auth)
|
4. Message pattern matching (billing vs rate_limit vs context vs auth)
|
||||||
5. Transport error heuristics
|
5. SSL/TLS transient alert patterns → retry as timeout
|
||||||
6. Server disconnect + large session → context overflow
|
6. Server disconnect + large session → context overflow
|
||||||
7. Fallback: unknown (retryable with backoff)
|
7. Transport error heuristics
|
||||||
|
8. Fallback: unknown (retryable with backoff)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
error: The exception from the API call.
|
error: The exception from the API call.
|
||||||
|
|
@ -388,7 +436,18 @@ def classify_api_error(
|
||||||
if classified is not None:
|
if classified is not None:
|
||||||
return classified
|
return classified
|
||||||
|
|
||||||
# ── 5. Server disconnect + large session → context overflow ─────
|
# ── 5. SSL/TLS transient errors → retry as timeout (not compression) ──
|
||||||
|
# SSL alerts mid-stream are transport hiccups, not server-side context
|
||||||
|
# overflow signals. Classify before the disconnect check so a large
|
||||||
|
# session doesn't incorrectly trigger context compression when the real
|
||||||
|
# cause is a flaky TLS handshake. Also matches when the error is
|
||||||
|
# wrapped in a generic exception whose message string carries the SSL
|
||||||
|
# alert text but the type isn't ssl.SSLError (happens with some SDKs
|
||||||
|
# that re-raise without chaining).
|
||||||
|
if any(p in error_msg for p in _SSL_TRANSIENT_PATTERNS):
|
||||||
|
return _result(FailoverReason.timeout, retryable=True)
|
||||||
|
|
||||||
|
# ── 6. Server disconnect + large session → context overflow ─────
|
||||||
# Must come BEFORE generic transport error catch — a disconnect on
|
# Must come BEFORE generic transport error catch — a disconnect on
|
||||||
# a large session is more likely context overflow than a transient
|
# a large session is more likely context overflow than a transient
|
||||||
# transport hiccup. Without this ordering, RemoteProtocolError
|
# transport hiccup. Without this ordering, RemoteProtocolError
|
||||||
|
|
@ -405,12 +464,12 @@ def classify_api_error(
|
||||||
)
|
)
|
||||||
return _result(FailoverReason.timeout, retryable=True)
|
return _result(FailoverReason.timeout, retryable=True)
|
||||||
|
|
||||||
# ── 6. Transport / timeout heuristics ───────────────────────────
|
# ── 7. Transport / timeout heuristics ───────────────────────────
|
||||||
|
|
||||||
if error_type in _TRANSPORT_ERROR_TYPES or isinstance(error, (TimeoutError, ConnectionError, OSError)):
|
if error_type in _TRANSPORT_ERROR_TYPES or isinstance(error, (TimeoutError, ConnectionError, OSError)):
|
||||||
return _result(FailoverReason.timeout, retryable=True)
|
return _result(FailoverReason.timeout, retryable=True)
|
||||||
|
|
||||||
# ── 7. Fallback: unknown ────────────────────────────────────────
|
# ── 8. Fallback: unknown ────────────────────────────────────────
|
||||||
|
|
||||||
return _result(FailoverReason.unknown, retryable=True)
|
return _result(FailoverReason.unknown, retryable=True)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -949,3 +949,94 @@ class TestAdversarialEdgeCases:
|
||||||
e = MockAPIError("server error", status_code=500, body={"message": None})
|
e = MockAPIError("server error", status_code=500, body={"message": None})
|
||||||
result = classify_api_error(e)
|
result = classify_api_error(e)
|
||||||
assert result is not None
|
assert result is not None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Test: SSL/TLS transient errors ─────────────────────────────────────
|
||||||
|
|
||||||
|
class TestSSLTransientPatterns:
|
||||||
|
"""SSL/TLS alerts mid-stream should retry as timeout, not unknown, and
|
||||||
|
should NOT trigger context compression even on a large session.
|
||||||
|
|
||||||
|
Motivation: OpenSSL 3.x changed TLS alert error code format
|
||||||
|
(`SSLV3_ALERT_BAD_RECORD_MAC` → `SSL/TLS_ALERT_BAD_RECORD_MAC`),
|
||||||
|
breaking string-exact matching in downstream retry logic. We match
|
||||||
|
stable substrings instead.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def test_bad_record_mac_classifies_as_timeout(self):
|
||||||
|
"""OpenSSL 3.x mid-stream bad record mac alert."""
|
||||||
|
e = Exception("[SSL: BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2580)")
|
||||||
|
result = classify_api_error(e)
|
||||||
|
assert result.reason == FailoverReason.timeout
|
||||||
|
assert result.retryable is True
|
||||||
|
assert result.should_compress is False
|
||||||
|
|
||||||
|
def test_openssl_3x_format_classifies_as_timeout(self):
|
||||||
|
"""New format `ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC` still matches
|
||||||
|
because we key on both space- and underscore-separated forms of
|
||||||
|
the stable `bad_record_mac` token."""
|
||||||
|
e = Exception("ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC during streaming")
|
||||||
|
result = classify_api_error(e)
|
||||||
|
assert result.reason == FailoverReason.timeout
|
||||||
|
assert result.retryable is True
|
||||||
|
assert result.should_compress is False
|
||||||
|
|
||||||
|
def test_tls_alert_internal_error_classifies_as_timeout(self):
|
||||||
|
e = Exception("[SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error")
|
||||||
|
result = classify_api_error(e)
|
||||||
|
assert result.reason == FailoverReason.timeout
|
||||||
|
assert result.retryable is True
|
||||||
|
assert result.should_compress is False
|
||||||
|
|
||||||
|
def test_ssl_handshake_failure_classifies_as_timeout(self):
|
||||||
|
e = Exception("ssl handshake failure during mid-stream")
|
||||||
|
result = classify_api_error(e)
|
||||||
|
assert result.reason == FailoverReason.timeout
|
||||||
|
assert result.retryable is True
|
||||||
|
|
||||||
|
def test_ssl_prefix_classifies_as_timeout(self):
|
||||||
|
"""Python's generic '[SSL: XYZ]' prefix from the ssl module."""
|
||||||
|
e = Exception("[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol")
|
||||||
|
result = classify_api_error(e)
|
||||||
|
assert result.reason == FailoverReason.timeout
|
||||||
|
assert result.retryable is True
|
||||||
|
|
||||||
|
def test_ssl_alert_on_large_session_does_not_compress(self):
|
||||||
|
"""Critical: SSL alerts on big contexts must NOT trigger context
|
||||||
|
compression — compression is expensive and won't fix a transport
|
||||||
|
hiccup. This is why _SSL_TRANSIENT_PATTERNS is separate from
|
||||||
|
_SERVER_DISCONNECT_PATTERNS.
|
||||||
|
"""
|
||||||
|
e = Exception("[SSL: BAD_RECORD_MAC] sslv3 alert bad record mac")
|
||||||
|
result = classify_api_error(
|
||||||
|
e,
|
||||||
|
approx_tokens=180000, # 90% of a 200k-context window
|
||||||
|
context_length=200000,
|
||||||
|
num_messages=300,
|
||||||
|
)
|
||||||
|
assert result.reason == FailoverReason.timeout
|
||||||
|
assert result.should_compress is False
|
||||||
|
|
||||||
|
def test_plain_disconnect_on_large_session_still_compresses(self):
|
||||||
|
"""Regression guard: the context-overflow-via-disconnect path
|
||||||
|
(non-SSL disconnects on large sessions) must still trigger
|
||||||
|
compression. Only SSL-specific disconnects skip it.
|
||||||
|
"""
|
||||||
|
e = Exception("Server disconnected without sending a response")
|
||||||
|
result = classify_api_error(
|
||||||
|
e,
|
||||||
|
approx_tokens=180000,
|
||||||
|
context_length=200000,
|
||||||
|
num_messages=300,
|
||||||
|
)
|
||||||
|
assert result.reason == FailoverReason.context_overflow
|
||||||
|
assert result.should_compress is True
|
||||||
|
|
||||||
|
def test_real_ssl_error_type_classifies_as_timeout(self):
|
||||||
|
"""Real ssl.SSLError instance — the type name alone (not message)
|
||||||
|
should route to the transport bucket."""
|
||||||
|
import ssl
|
||||||
|
e = ssl.SSLError("arbitrary ssl error")
|
||||||
|
result = classify_api_error(e)
|
||||||
|
assert result.reason == FailoverReason.timeout
|
||||||
|
assert result.retryable is True
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue