mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(error_classifier): retry mid-stream SSL/TLS alert errors as transport
Mid-stream SSL alerts (bad_record_mac, tls_alert_internal_error, handshake
failures) previously fell through the classifier pipeline to the 'unknown'
bucket because:
- ssl.SSLError type names weren't in _TRANSPORT_ERROR_TYPES (the
isinstance(OSError) catch picks up some but not all SDK-wrapped forms)
- the message-pattern list had no SSL alert substrings
The 'unknown' bucket is still retryable, but: (a) logs tell the user
'unknown' instead of identifying the cause, (b) it bypasses the
transport-specific backoff/fallback logic, and (c) if the SSL error
happens on a large session with a generic 'connection closed' wrapper,
the existing disconnect-on-large-session heuristic would incorrectly
trigger context compression — expensive, and never fixes a transport
hiccup.
Changes:
- Add ssl.SSLError and its subclass type names to _TRANSPORT_ERROR_TYPES
- New _SSL_TRANSIENT_PATTERNS list (separate from _SERVER_DISCONNECT_PATTERNS
so SSL alerts route to timeout, not context_overflow+compress)
- New step 5 in the classifier pipeline: SSL pattern check runs BEFORE
the disconnect check to pre-empt the large-session-compress path
Patterns cover both space-separated ('ssl alert', 'bad record mac')
and underscore-separated ('ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC')
forms. This is load-bearing because OpenSSL 3.x renamed the alert
error-code prefix from 'SSLV3_' to 'SSL/TLS_' (e.g. SSLV3_ALERT_BAD_RECORD_MAC →
SSL/TLS_ALERT_BAD_RECORD_MAC) and will likely churn again — matching on
stable alert reason substrings survives future format changes.
Tests (8 new):
- BAD_RECORD_MAC in Python ssl.c format
- OpenSSL 3.x underscore format
- TLSV1_ALERT_INTERNAL_ERROR
- ssl handshake failure
- [SSL: ...] prefix fallback
- Real ssl.SSLError instance
- REGRESSION GUARD: SSL on large session does NOT compress
- REGRESSION GUARD: plain disconnect on large session STILL compresses
This commit is contained in:
parent
402d048eb6
commit
b40b6ec720
2 changed files with 156 additions and 6 deletions
|
|
@@ -949,3 +949,94 @@ class TestAdversarialEdgeCases:
|
|||
e = MockAPIError("server error", status_code=500, body={"message": None})
|
||||
result = classify_api_error(e)
|
||||
assert result is not None
|
||||
|
||||
|
||||
# ── Test: SSL/TLS transient errors ─────────────────────────────────────
|
||||
|
||||
class TestSSLTransientPatterns:
    """Mid-stream SSL/TLS alerts must classify as retryable timeouts.

    They should not land in the 'unknown' bucket, and they should never
    request context compression, even when the session is large.

    Background: OpenSSL 3.x changed the TLS alert error-code format
    (`SSLV3_ALERT_BAD_RECORD_MAC` → `SSL/TLS_ALERT_BAD_RECORD_MAC`), which
    breaks exact-string matching in retry logic. These tests exercise
    matching on the stable substrings instead.
    """

    # Session-size arguments shared by the two large-session regression guards.
    _LARGE_SESSION = {
        "approx_tokens": 180000,  # 90% of a 200k-context window
        "context_length": 200000,
        "num_messages": 300,
    }

    def test_bad_record_mac_classifies_as_timeout(self):
        """Bad record mac alert in the Python ssl module (`_ssl.c`) format."""
        err = Exception(
            "[SSL: BAD_RECORD_MAC] sslv3 alert bad record mac (_ssl.c:2580)"
        )
        verdict = classify_api_error(err)
        assert verdict.reason == FailoverReason.timeout
        assert verdict.retryable is True
        assert verdict.should_compress is False

    def test_openssl_3x_format_classifies_as_timeout(self):
        """The newer `ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC` spelling is
        expected to match too, via the underscore-separated form of the
        stable `bad_record_mac` token."""
        err = Exception("ERR_SSL_SSL/TLS_ALERT_BAD_RECORD_MAC during streaming")
        verdict = classify_api_error(err)
        assert verdict.reason == FailoverReason.timeout
        assert verdict.retryable is True
        assert verdict.should_compress is False

    def test_tls_alert_internal_error_classifies_as_timeout(self):
        """A TLSv1 internal-error alert is a retryable timeout."""
        err = Exception(
            "[SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error"
        )
        verdict = classify_api_error(err)
        assert verdict.reason == FailoverReason.timeout
        assert verdict.retryable is True
        assert verdict.should_compress is False

    def test_ssl_handshake_failure_classifies_as_timeout(self):
        """A plain-text 'ssl handshake failure' message is a retryable timeout."""
        err = Exception("ssl handshake failure during mid-stream")
        verdict = classify_api_error(err)
        assert verdict.reason == FailoverReason.timeout
        assert verdict.retryable is True

    def test_ssl_prefix_classifies_as_timeout(self):
        """The ssl module's generic '[SSL: XYZ]' prefix is a retryable timeout."""
        err = Exception(
            "[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol"
        )
        verdict = classify_api_error(err)
        assert verdict.reason == FailoverReason.timeout
        assert verdict.retryable is True

    def test_ssl_alert_on_large_session_does_not_compress(self):
        """Regression guard: an SSL alert on a big context must NOT ask for
        context compression. Compression is expensive and cannot fix a
        transport hiccup, which is why _SSL_TRANSIENT_PATTERNS is kept
        separate from _SERVER_DISCONNECT_PATTERNS.
        """
        err = Exception("[SSL: BAD_RECORD_MAC] sslv3 alert bad record mac")
        verdict = classify_api_error(err, **self._LARGE_SESSION)
        assert verdict.reason == FailoverReason.timeout
        assert verdict.should_compress is False

    def test_plain_disconnect_on_large_session_still_compresses(self):
        """Regression guard: a non-SSL disconnect on a large session must
        still take the context-overflow path and request compression.
        Only SSL-specific disconnects are exempt.
        """
        err = Exception("Server disconnected without sending a response")
        verdict = classify_api_error(err, **self._LARGE_SESSION)
        assert verdict.reason == FailoverReason.context_overflow
        assert verdict.should_compress is True

    def test_real_ssl_error_type_classifies_as_timeout(self):
        """A genuine ssl.SSLError instance should reach the transport
        bucket on its type name alone, regardless of its message."""
        import ssl

        err = ssl.SSLError("arbitrary ssl error")
        verdict = classify_api_error(err)
        assert verdict.reason == FailoverReason.timeout
        assert verdict.retryable is True
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue