From 6212e9ade8702c49b44dc22c2622e6fd9f534205 Mon Sep 17 00:00:00 2001 From: Soju Date: Fri, 15 May 2026 11:26:58 +0900 Subject: [PATCH] fix(error-classifier): treat 5xx request-validation errors as non-retryable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standard OpenAI returns request-validation failures (unknown/ unsupported parameter, malformed request) as 4xx. Some OpenAI-compatible gateways return them as 5xx instead — codex.nekos.me returns 502 for an unknown parameter. The generic '5xx -> retryable server_error' rule then misfires: the error is deterministic (every retry gets the identical rejection), so the retry loop burns all 3 attempts, the transport-recovery path resets the counter and burns 3 more, and the result is a request flood against a request that can never succeed. Fix: when a 500/502 body carries an unambiguous request-validation signal — 'unknown parameter' / 'unsupported parameter' / 'unrecognized request argument' / 'invalid_request_error' / 'unknown_parameter' / 'unsupported_parameter' in the message text, or invalid_request_error / unknown_parameter / unsupported_parameter as the structured error code — classify as a non-retryable format_error so the loop fails fast and falls back. Genuine 502 Bad Gateway with no such signal stays retryable as before. Origin: local-author Upstream-PR: none Patch-State: local-only --- agent/error_classifier.py | 35 +++++++++++++++++ tests/agent/test_error_classifier.py | 58 ++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/agent/error_classifier.py b/agent/error_classifier.py index 7fa38bbcf70..0afcf66d445 100644 --- a/agent/error_classifier.py +++ b/agent/error_classifier.py @@ -240,6 +240,24 @@ _MODEL_NOT_FOUND_PATTERNS = [ "unsupported model", ] +# Request-validation patterns — the request is malformed and will fail +# identically on every retry. 
Some OpenAI-compatible gateways (notably +# codex.nekos.me) return these as 5xx instead of the standard 4xx, which +# makes the generic "5xx → retryable server_error" rule misfire: the retry +# loop hammers the same deterministic rejection 3+ times, then the +# transport-recovery path resets the counter and does it again, producing +# a request flood. When a 5xx body carries one of these unambiguous +# request-validation signals, classify as a non-retryable format_error so +# the loop fails fast and falls back instead of looping. +_REQUEST_VALIDATION_PATTERNS = [ + "unknown parameter", + "unsupported parameter", + "unrecognized request argument", + "invalid_request_error", + "unknown_parameter", + "unsupported_parameter", +] + # OpenRouter aggregator policy-block patterns. # # When a user's OpenRouter account privacy setting (or a per-request @@ -745,6 +763,23 @@ def _classify_by_status( ) if status_code in {500, 502}: + # Some OpenAI-compatible gateways return request-validation errors + # with a 5xx status (codex.nekos.me returns 502 for unknown/ + # unsupported parameters). These are deterministic — every retry + # gets the identical rejection — so the generic "5xx → retryable + # server_error" rule turns one bad request into a retry flood. + # Detect the unambiguous request-validation signals (in either the + # message text or the structured error code) and fail fast. 
+ if ( + any(p in error_msg for p in _REQUEST_VALIDATION_PATTERNS) + or error_code.lower() in {"invalid_request_error", "unknown_parameter", + "unsupported_parameter"} + ): + return result_fn( + FailoverReason.format_error, + retryable=False, + should_fallback=True, + ) return result_fn(FailoverReason.server_error, retryable=True) if status_code in {503, 529}: diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py index eef3650347c..397d2673552 100644 --- a/tests/agent/test_error_classifier.py +++ b/tests/agent/test_error_classifier.py @@ -293,6 +293,64 @@ class TestClassifyApiError: result = classify_api_error(e) assert result.reason == FailoverReason.overloaded + # ── 5xx that are actually request-validation errors ── + # Some OpenAI-compatible gateways (e.g. codex.nekos.me) return + # request-validation failures with a 5xx status. These are + # deterministic, so they must NOT be retried — otherwise the retry + # loop hammers the identical bad request into a flood. + + def test_502_with_unknown_parameter_is_non_retryable(self): + e = MockAPIError( + "Unknown parameter: 'input[617]._empty_recovery_synthetic'", + status_code=502, + body={ + "error": { + "type": "invalid_request_error", + "message": ( + "[ObjectParam] [input[617]._empty_recovery_synthetic] " + "[unknown_parameter] Unknown parameter: " + "'input[617]._empty_recovery_synthetic'." 
+ ), + } + }, + ) + result = classify_api_error(e) + assert result.reason == FailoverReason.format_error + assert result.retryable is False + assert result.should_fallback is True + + def test_502_with_unsupported_parameter_is_non_retryable(self): + e = MockAPIError( + "Unsupported parameter: logprobs", + status_code=502, + body={ + "error": { + "type": "invalid_request_error", + "message": "Unsupported parameter: logprobs", + } + }, + ) + result = classify_api_error(e) + assert result.reason == FailoverReason.format_error + assert result.retryable is False + + def test_500_with_invalid_request_error_type_is_non_retryable(self): + e = MockAPIError( + "bad request", + status_code=500, + body={"error": {"type": "invalid_request_error", "message": "bad request"}}, + ) + result = classify_api_error(e) + assert result.reason == FailoverReason.format_error + assert result.retryable is False + + def test_502_plain_bad_gateway_still_retryable(self): + """A genuine 502 with no request-validation signal stays retryable.""" + e = MockAPIError("Bad Gateway", status_code=502) + result = classify_api_error(e) + assert result.reason == FailoverReason.server_error + assert result.retryable is True + # ── Model not found ── def test_404_model_not_found(self):