fix(error_classifier): classify xAI Grok entitlement SSE errors as auth

When xAI returns a subscription/entitlement error through an SSE
``type=error`` frame, ``_StreamErrorEvent`` is raised with
``status_code=None``.  This caused ``_classify_by_status`` (step 2 of
``classify_api_error``) to be skipped entirely, and the Grok-specific
phrases ("do not have an active Grok subscription", "out of available
resources") appeared in none of the message-pattern lists.  The error
fell through to ``FailoverReason.unknown (retryable=True)``, burning
``max_retries`` on every affected X Premium+ / SuperGrok user before
the agent stopped — and ``_is_entitlement_failure`` was never called
because it only fires under ``FailoverReason.auth``.

The HTTP 403 path already handled this correctly (``_classify_by_status``
returns ``auth/non-retryable`` for 403).  Add an explicit pattern block
at step 1 (highest priority, before the ``status_code`` guard) so both
code paths route to ``FailoverReason.auth, retryable=False,
should_fallback=True`` — matching the 403 path exactly.

Add three regression tests in the ``Fix D`` section of
``test_codex_xai_oauth_recovery.py``:
- primary "do not have an active Grok subscription" phrase
- "out of available resources" + "grok" variant
- unrelated ``_StreamErrorEvent`` must not be reclassified
This commit is contained in:
EloquentBrush0x 2026-05-17 12:02:36 +03:00 committed by Teknium
parent bc77f79798
commit 1fabd6e100
2 changed files with 85 additions and 0 deletions

View file

@ -510,6 +510,35 @@ def classify_api_error(
should_compress=False,
)
# xAI Grok subscription entitlement errors.
#
# xAI returns "You have either run out of available resources or do not
# have an active Grok subscription" through two distinct code paths:
#
# • HTTP 403 — status_code is set; _classify_by_status (step 2) routes
# it to FailoverReason.auth correctly, and _is_entitlement_failure
# then prevents the credential-refresh loop.
#
# • SSE ``type=error`` frame — surfaced as _StreamErrorEvent with
# status_code=None. _classify_by_status is skipped entirely, and
# "grok subscription" / "out of available resources" appear in none
# of the message-pattern lists below. Without this guard the error
# falls through to FailoverReason.unknown (retryable=True), burning
# max_retries before the agent stops — and _is_entitlement_failure
# is never called because it only runs under FailoverReason.auth.
#
# Both X Premium+ and SuperGrok subscribers hit this path when their
# subscription tier does not cover the requested model or feature.
if (
"do not have an active grok subscription" in error_msg
or ("out of available resources" in error_msg and "grok" in error_msg)
):
return _result(
FailoverReason.auth,
retryable=False,
should_fallback=True,
)
# ── 2. HTTP status code classification ──────────────────────────
if status_code is not None:

View file

@ -224,6 +224,62 @@ def test_summarize_api_error_passes_through_unrelated_errors():
assert "upstream is sad" in summary
# ---------------------------------------------------------------------------
# Fix D: _StreamErrorEvent xAI entitlement classified as auth, not retryable
#
# run_codex_create_stream_fallback raises _StreamErrorEvent (status_code=None)
# when the Responses stream emits a ``type=error`` SSE frame. Before this
# fix, classify_api_error had no match for "grok subscription" in its pattern
# lists, so it returned FailoverReason.unknown (retryable=True) — burning
# max_retries before the agent stopped. _is_entitlement_failure was never
# called because it only runs when FailoverReason.auth is returned.
# ---------------------------------------------------------------------------
def test_classify_api_error_stream_event_grok_subscription_is_auth():
    """An SSE-delivered xAI subscription error must classify as auth.

    The error is built as a ``_StreamErrorEvent`` carrying the full xAI
    entitlement message plus a permission-denied ``code``.  The expected
    classification is ``FailoverReason.auth`` with ``retryable=False`` and
    ``should_fallback=True``, i.e. no retry loop and an immediate fallback.
    """
    # Imports stay local to the test, in the same order as before, so module
    # import side effects are unchanged.
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    entitlement_message = (
        "You have either run out of available resources or do not have an "
        "active Grok subscription. Manage subscriptions at https://grok.com"
    )
    permission_code = (
        "The caller does not have permission to execute the specified operation"
    )
    stream_error = _StreamErrorEvent(entitlement_message, code=permission_code)

    classification = classify_api_error(
        stream_error,
        provider="xai-oauth",
        model="grok-4.3",
    )

    # Reason first: if this fails, the remaining flags are meaningless.
    assert classification.reason == FailoverReason.auth
    assert classification.retryable is False
    assert classification.should_fallback is True
def test_classify_api_error_stream_event_resources_exhausted_grok_is_auth():
    """'out of available resources' + 'grok' variant also classifies as auth.

    This message lacks the "do not have an active Grok subscription" phrase,
    so it exercises only the second arm of the entitlement guard.  All three
    routing flags are pinned, mirroring the primary-phrase test, so that a
    regression in ``should_fallback`` for this variant cannot slip through.
    """
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    err = _StreamErrorEvent(
        "You have run out of available resources for Grok.",
    )
    result = classify_api_error(err, provider="xai-oauth", model="grok-4.3")

    assert result.reason == FailoverReason.auth
    assert result.retryable is False
    # Previously unchecked here: the guard returns should_fallback=True for
    # both phrases, so the fallback flag must be verified for this one too.
    assert result.should_fallback is True
def test_classify_api_error_stream_event_unrelated_not_reclassified():
    """A generic stream error must not be swept up by the xAI entitlement guard.

    The message mentions neither a Grok subscription nor exhausted resources,
    so the classifier is expected to return any reason other than
    ``FailoverReason.auth``.
    """
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    generic_failure = _StreamErrorEvent("Internal server error — try again later")
    outcome = classify_api_error(
        generic_failure,
        provider="xai-oauth",
        model="grok-4.3",
    )

    # Negative check only: the exact non-auth reason is deliberately not pinned.
    assert outcome.reason != FailoverReason.auth
# ---------------------------------------------------------------------------
# Fix C: reasoning replay gating for xai-oauth
# ---------------------------------------------------------------------------