mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-31 06:51:29 +00:00
Eleven new tests pinning the #29344 fix. Layout mirrors the existing "Fix D" entitlement section so the bad-credentials disambiguator sits alongside the entitlement-block tests it complements. Classifier-level coverage: * ``test_is_entitlement_failure_false_for_bad_credentials_wke_suffix`` — verbatim shape from the reporter's wire capture (``{code: 'caller does not have permission', error: 'OAuth2 access token could not be validated. [WKE=unauthenticated:bad-credentials]'}``) ↦ classifier must return False so the refresh path runs. * ``test_is_entitlement_failure_false_for_wke_suffix_in_normalized_shape`` — same body after ``_extract_api_error_context`` has rewritten it to ``{reason, message}``. The disambiguator must fire in BOTH shapes; without this guard the production call site at ``_recover_with_credential_pool`` (which goes through the normalised extractor) would still misclassify. * ``test_is_entitlement_failure_false_for_any_wke_unauthenticated_variant`` — parametrised forward-compat: ``bad-credentials``, ``expired-token``, ``revoked``, ``some-future-reason``. xAI documents the prefix as stable, the suffix after the colon as a reason code that can grow; every variant under ``unauthenticated:`` must route to refresh. * ``test_is_entitlement_failure_false_via_oauth2_validation_phrase_alone`` — belt-and-braces guard: if a future API revision drops the WKE suffix but keeps "OAuth2 access token could not be validated", we still classify correctly. * ``test_is_entitlement_failure_wke_signal_overrides_entitlement_keywords`` — defensive: if a body ever carries BOTH the WKE suffix and entitlement language, the WKE signal wins. Auth is recoverable; entitlement isn't, and a refreshed token will resurface the entitlement message on the next request. * ``test_is_entitlement_failure_case_insensitive_wke_match`` — pins that the classifier lowercases the haystack so a future xAI build that uppercases the prefix doesn't reintroduce the bug. 
Recovery-path coverage (end-to-end through ``_recover_with_credential_pool``): * ``test_recover_with_credential_pool_refreshes_on_xai_bad_credentials_403`` — the headline test the reporter requested: a bad-credentials 403 with the exact wire body must call ``try_refresh_current()`` exactly once and ``_swap_credential`` once. Pre-fix this returned ``(False, _)`` because the entitlement classifier over-matched and short-circuited the refresh path. * ``test_recover_with_credential_pool_still_blocks_real_entitlement`` — companion regression guard for #26847: a pure unsubscribed- account body (no WKE suffix, no OAuth2-validation phrase) must still surface as entitlement and skip refresh. The new disambiguator must not weaken the original loop-protection it was added to preserve. The scaffolding reuses ``_make_codex_agent``, ``_FakePool``, and the existing ``MagicMock`` patterns from the surrounding tests so the new section reads as a natural extension of "Fix D" rather than a separate test file.
906 lines
34 KiB
Python
906 lines
34 KiB
Python
"""Regression tests for the May 2026 xAI OAuth (SuperGrok / X Premium) bugs.
|
|
|
|
Three distinct failure modes the user community hit during rollout:
|
|
|
|
1. ``RuntimeError("Expected to have received `response.created` before
|
|
`error`")`` on multi-turn xAI OAuth conversations. The OpenAI SDK's
|
|
Responses streaming state machine collapses an upstream ``error`` SSE
|
|
frame into a generic stream-ordering error. ``_run_codex_stream``
|
|
now treats this the same way it already treats the missing
|
|
``response.completed`` postlude — fall back to a non-stream
|
|
``responses.create(stream=True)`` which surfaces the real provider
|
|
error. Also closes #8133 (``response.in_progress`` prelude on custom
|
|
relays) and #14634 (``codex.rate_limits`` prelude on codex-lb).
|
|
|
|
2. The HTTP 403 entitlement error xAI returns when an OAuth token lacks
|
|
SuperGrok / X Premium ("You have either run out of available
|
|
resources or do not have an active Grok subscription") used to read
|
|
as a confusing wall of JSON. ``_summarize_api_error`` now surfaces
xAI's own body text verbatim, with no Hermes-side hint layered on top
(an earlier appended hint was removed; see the Fix B section below).
|
|
|
|
3. Multi-turn replay of ``codex_reasoning_items`` (with
|
|
``encrypted_content``) was briefly suppressed for ``is_xai_responses``
|
|
in PR #26644 on the theory that xAI's OAuth/SuperGrok surface
|
|
rejected replayed encrypted reasoning items. That suppression was
|
|
reverted shortly after: xAI confirmed they explicitly want Hermes to
|
|
thread encrypted reasoning back across turns, and the original
|
|
multi-turn failure mode was actually the prelude-SSE issue closed by
|
|
Fix A above. The remaining tests here lock in that xAI receives
|
|
replayed reasoning AND that we ask xAI to echo it back in the
|
|
``include`` array.
|
|
"""
|
|
|
|
from types import SimpleNamespace
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix A: prelude error fallback
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _make_codex_agent():
    """Return a bare-bones ``AIAgent`` set up for codex_responses streaming tests.

    The agent is constructed against the xAI base URL with a placeholder
    key, then switched to the ``codex_responses`` API mode under the
    ``xai-oauth`` provider with its interrupt flag cleared.
    """
    from run_agent import AIAgent

    init_kwargs = {
        "api_key": "test-key",
        "base_url": "https://api.x.ai/v1",
        "model": "grok-4.3",
        "quiet_mode": True,
        "skip_context_files": True,
        "skip_memory": True,
    }
    agent = AIAgent(**init_kwargs)

    # Post-construction overrides every test in this module relies on.
    agent.api_mode = "codex_responses"
    agent.provider = "xai-oauth"
    agent._interrupt_requested = False
    return agent
|
|
|
|
|
|
@pytest.mark.parametrize(
    "prelude_event_type",
    [
        "error",                 # xAI OAuth multi-turn
        "codex.rate_limits",     # codex-lb relays (#14634)
        "response.in_progress",  # custom Responses relays (#8133)
    ],
)
def test_codex_stream_prelude_error_falls_back_to_create_stream(prelude_event_type):
    """A prelude ``RuntimeError`` from the SDK must route to the fallback.

    The streaming call is mocked to raise the "Expected to have received
    `response.created` before ..." error for each event type.  The agent is
    expected to hand the same request and client to
    ``_run_codex_create_stream_fallback`` and return whatever it yields.
    """
    agent = _make_codex_agent()

    client = MagicMock()
    client.responses.stream.side_effect = RuntimeError(
        f"Expected to have received `response.created` before `{prelude_event_type}`"
    )

    # Canned response handed back by the patched fallback.
    text_part = SimpleNamespace(type="output_text", text="fallback ok")
    output_item = SimpleNamespace(type="message", content=[text_part])
    canned = SimpleNamespace(output=[output_item], status="completed")

    fallback_patch = patch.object(
        agent, "_run_codex_create_stream_fallback", return_value=canned
    )
    with fallback_patch as fallback:
        outcome = agent._run_codex_stream({}, client=client)

    assert outcome is canned
    fallback.assert_called_once_with({}, client=client)
|
|
|
|
|
|
def test_codex_stream_prelude_error_retries_once_before_fallback():
    """One extra stream attempt must happen before the fallback takes over."""
    agent = _make_codex_agent()

    # One entry is appended per invocation of the mocked stream call.
    attempts = []

    def always_prelude_error(**kwargs):
        attempts.append(None)
        raise RuntimeError(
            "Expected to have received `response.created` before `error`"
        )

    client = MagicMock()
    client.responses.stream.side_effect = always_prelude_error

    canned = SimpleNamespace(output=[], status="completed")
    fallback_patch = patch.object(
        agent, "_run_codex_create_stream_fallback", return_value=canned
    )
    with fallback_patch as fallback:
        agent._run_codex_stream({}, client=client)

    # max_stream_retries=1 → one retry + final attempt → 2 stream calls,
    # THEN the fallback path runs.
    assert len(attempts) == 2
    fallback.assert_called_once()
|
|
|
|
|
|
def test_codex_stream_unrelated_runtimeerror_still_raises():
    """A ``RuntimeError`` without the prelude/postlude wording must propagate.

    The fallback is patched only so the test can verify it was never used.
    """
    agent = _make_codex_agent()

    client = MagicMock()
    client.responses.stream.side_effect = RuntimeError("something else broke")

    fallback_patch = patch.object(agent, "_run_codex_create_stream_fallback")
    with fallback_patch as fallback, pytest.raises(
        RuntimeError, match="something else broke"
    ):
        agent._run_codex_stream({}, client=client)

    fallback.assert_not_called()
|
|
|
|
|
|
def test_codex_stream_postlude_error_still_falls_back():
    """The pre-existing missing-``response.completed`` fallback must keep working."""
    agent = _make_codex_agent()

    postlude_error = RuntimeError("Didn't receive a `response.completed` event.")
    client = MagicMock()
    client.responses.stream.side_effect = postlude_error

    canned = SimpleNamespace(output=[], status="completed")
    fallback_patch = patch.object(
        agent, "_run_codex_create_stream_fallback", return_value=canned
    )
    with fallback_patch as fallback:
        outcome = agent._run_codex_stream({}, client=client)

    assert outcome is canned
    fallback.assert_called_once()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix B: surface xAI's entitlement body verbatim (no editorializing)
|
|
#
|
|
# The original PR #26644 appended a hint that led with "X Premium+ does NOT
|
|
# include xAI API access — only standalone SuperGrok subscribers can use this
|
|
# provider." xAI announced on 2026-05-16 that X Premium subs now work in
|
|
# Hermes (https://x.ai/news/grok-hermes), making that hint actively wrong:
|
|
# a Premium+ user hitting a real entitlement issue (no Grok sub, wrong tier,
|
|
# exhausted quota) would be misdirected to switch subscriptions when their
|
|
# Premium sub is in fact valid. We now surface xAI's own body text verbatim
|
|
# (which already says "Manage subscriptions at https://grok.com/?_s=usage")
|
|
# and leave the diagnosis to xAI's wording.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_summarize_api_error_surfaces_xai_entitlement_body_verbatim():
    """The xAI OAuth 403 body is shown as received, without any added hint."""
    from run_agent import AIAgent

    wire_text = (
        "HTTP 403: Error code: 403 - {'code': 'The caller does not have permission "
        "to execute the specified operation', 'error': 'You have either run out of "
        "available resources or do not have an active Grok subscription. Manage "
        "subscriptions at https://grok.com'}"
    )
    summary = AIAgent._summarize_api_error(RuntimeError(wire_text))

    # xAI's own body text must reach the user — they need it to diagnose.
    assert "do not have an active Grok subscription" in summary

    # No stale claim that X Premium is incompatible with Hermes.
    stale_claims = (
        "X Premium+ does NOT include",
        "standalone SuperGrok subscribers",
    )
    for claim in stale_claims:
        assert claim not in summary
|
|
|
|
|
|
def test_summarize_api_error_xai_body_message_unwrapped():
    """An SDK-style exception with a structured ``body`` yields a clean summary.

    The exception class carries ``status_code`` and a nested
    ``body["error"]["message"]`` as class attributes; the summary must show
    the HTTP status and xAI's message, with no extra commentary.
    """
    from run_agent import AIAgent

    entitlement_message = (
        "You have either run out of available resources or do "
        "not have an active Grok subscription. Manage at "
        "https://grok.com"
    )

    class _XaiErr(Exception):
        status_code = 403
        body = {"error": {"message": entitlement_message}}

    summary = AIAgent._summarize_api_error(_XaiErr("403"))

    for expected in ("HTTP 403", "do not have an active Grok subscription"):
        assert expected in summary
    # No editorializing on top of xAI's own wording.
    assert "X Premium+ does NOT include" not in summary
|
|
|
|
|
|
def test_summarize_api_error_passes_through_unrelated_errors():
    """Errors unrelated to xAI entitlement are summarized without Grok wording."""
    from run_agent import AIAgent

    summary = AIAgent._summarize_api_error(
        RuntimeError("HTTP 500: upstream is sad")
    )

    for xai_marker in ("SuperGrok", "grok.com"):
        assert xai_marker not in summary
    assert "upstream is sad" in summary
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix D: _StreamErrorEvent xAI entitlement classified as auth, not retryable
|
|
#
|
|
# run_codex_create_stream_fallback raises _StreamErrorEvent (status_code=None)
|
|
# when the Responses stream emits a ``type=error`` SSE frame. Before this
|
|
# fix, classify_api_error had no match for "grok subscription" in its pattern
|
|
# lists, so it returned FailoverReason.unknown (retryable=True) — burning
|
|
# max_retries before the agent stopped. _is_entitlement_failure was never
|
|
# called because it only runs when FailoverReason.auth is returned.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_classify_api_error_stream_event_grok_subscription_is_auth():
    """An xAI subscription ``_StreamErrorEvent`` classifies as non-retryable auth.

    The event is built without a status code, so the classification under
    test has to come from the message text.  The expected result is
    ``FailoverReason.auth``, not retryable, and eligible for fallback.
    """
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    subscription_text = (
        "You have either run out of available resources or do not have an "
        "active Grok subscription. Manage subscriptions at https://grok.com"
    )
    event = _StreamErrorEvent(
        subscription_text,
        code="The caller does not have permission to execute the specified operation",
    )

    verdict = classify_api_error(event, provider="xai-oauth", model="grok-4.3")

    assert verdict.reason == FailoverReason.auth
    assert verdict.retryable is False
    assert verdict.should_fallback is True
|
|
|
|
|
|
def test_classify_api_error_stream_event_resources_exhausted_grok_is_auth():
    """The "out of available resources" + "Grok" wording is also treated as auth."""
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    event = _StreamErrorEvent("You have run out of available resources for Grok.")
    verdict = classify_api_error(event, provider="xai-oauth", model="grok-4.3")

    assert verdict.reason == FailoverReason.auth
    assert verdict.retryable is False
|
|
|
|
|
|
def test_classify_api_error_stream_event_unrelated_not_reclassified():
    """A ``_StreamErrorEvent`` with unrelated text must not come back as auth."""
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    event = _StreamErrorEvent("Internal server error — try again later")
    verdict = classify_api_error(event, provider="xai-oauth", model="grok-4.3")

    assert verdict.reason != FailoverReason.auth
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix C: reasoning replay gating for xai-oauth
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _assistant_msg_with_encrypted_reasoning(text="hi from grok", encrypted="enc_blob"):
|
|
return {
|
|
"role": "assistant",
|
|
"content": text,
|
|
"codex_reasoning_items": [
|
|
{
|
|
"type": "reasoning",
|
|
"id": "rs_xai_001",
|
|
"encrypted_content": encrypted,
|
|
"summary": [],
|
|
}
|
|
],
|
|
}
|
|
|
|
|
|
def test_codex_reasoning_replay_default_includes_encrypted_content():
    """With default arguments the adapter replays the encrypted reasoning item."""
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    conversation = [
        {"role": "user", "content": "hi"},
        _assistant_msg_with_encrypted_reasoning(),
        {"role": "user", "content": "what's your name?"},
    ]

    converted = _chat_messages_to_responses_input(conversation)

    replayed = []
    for item in converted:
        if item.get("type") == "reasoning":
            replayed.append(item)

    assert len(replayed) == 1
    assert replayed[0]["encrypted_content"] == "enc_blob"
|
|
|
|
|
|
def test_codex_reasoning_replay_includes_encrypted_content_for_xai():
    """xAI must receive replayed encrypted reasoning items (May 2026 reversal).

    These items were stripped for a while on the assumption that the
    OAuth/SuperGrok surface rejected them.  xAI later confirmed they want
    Hermes to thread encrypted reasoning back across turns for cross-turn
    coherence, so the adapter must keep them when ``is_xai_responses`` is set.
    """
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    conversation = [
        {"role": "user", "content": "hi"},
        _assistant_msg_with_encrypted_reasoning(),
        {"role": "user", "content": "what's your name?"},
    ]

    converted = _chat_messages_to_responses_input(
        conversation, is_xai_responses=True
    )

    replayed = [item for item in converted if item.get("type") == "reasoning"]
    assert len(replayed) == 1, (
        "xAI must receive replayed reasoning items — see docstring for the "
        "May 2026 reversal of the earlier suppression gate."
    )
    assert replayed[0]["encrypted_content"] == "enc_blob"

    # And the assistant's visible text must still be present alongside it.
    has_assistant_item = any(
        item.get("role") == "assistant" or item.get("type") == "message"
        for item in converted
    )
    assert has_assistant_item, "assistant message must still be present"
|
|
|
|
|
|
def test_codex_transport_xai_request_includes_encrypted_content():
    """The xAI request's ``include`` list asks for ``reasoning.encrypted_content``.

    This is the request-side half of the May 2026 reversal: xAI is asked to
    echo encrypted reasoning back so the following turn can replay it.
    """
    from agent.transports.codex import ResponsesApiTransport

    system_prompt = "you are a helpful assistant"
    conversation = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "hi"},
    ]

    request = ResponsesApiTransport().build_kwargs(
        model="grok-4.3",
        messages=conversation,
        tools=None,
        instructions=system_prompt,
        reasoning_config={"enabled": True, "effort": "medium"},
        is_xai_responses=True,
    )

    assert request["include"] == ["reasoning.encrypted_content"]
|
|
|
|
|
|
def test_codex_transport_xai_replays_reasoning_in_input():
    """``build_kwargs`` for xAI must carry prior encrypted reasoning in ``input``."""
    from agent.transports.codex import ResponsesApiTransport

    conversation = [
        {"role": "system", "content": "sys"},
        {"role": "user", "content": "hi"},
        _assistant_msg_with_encrypted_reasoning(text="hi from grok"),
        {"role": "user", "content": "what's your name?"},
    ]

    request = ResponsesApiTransport().build_kwargs(
        model="grok-4.3",
        messages=conversation,
        tools=None,
        instructions="sys",
        reasoning_config={"enabled": True, "effort": "medium"},
        is_xai_responses=True,
    )

    replayed = []
    for item in request["input"]:
        if item.get("type") == "reasoning":
            replayed.append(item)

    assert len(replayed) == 1
    assert replayed[0]["encrypted_content"] == "enc_blob"
|
|
|
|
|
|
def test_codex_transport_native_codex_still_replays_reasoning_in_input():
    """Regression guard: the non-xAI (openai-codex) replay path is unchanged.

    With ``is_xai_responses=False`` the request must still replay the prior
    reasoning item and still list ``reasoning.encrypted_content`` in
    ``include``.
    """
    from agent.transports.codex import ResponsesApiTransport

    conversation = [
        {"role": "system", "content": "sys"},
        {"role": "user", "content": "hi"},
        _assistant_msg_with_encrypted_reasoning(text="hi from codex"),
        {"role": "user", "content": "next"},
    ]

    request = ResponsesApiTransport().build_kwargs(
        model="gpt-5-codex",
        messages=conversation,
        tools=None,
        instructions="sys",
        reasoning_config={"enabled": True, "effort": "medium"},
        is_xai_responses=False,
    )

    replayed = [item for item in request["input"] if item.get("type") == "reasoning"]
    assert len(replayed) == 1
    assert replayed[0]["encrypted_content"] == "enc_blob"

    # Native Codex still asks for encrypted_content back.
    requested_includes = request.get("include", [])
    assert "reasoning.encrypted_content" in requested_includes
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix D: entitlement 403 must NOT trigger credential-pool refresh loop
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.parametrize(
    "message",
    [
        # The exact wire text RaidenTyler and Don Piedro captured.
        "You have either run out of available resources or do not have an "
        "active Grok subscription. Manage at https://grok.com",
        # Permission-style variant from the same 403 body.
        "The caller does not have permission to execute the specified "
        "operation for grok-4.3",
    ],
)
def test_is_entitlement_failure_matches_real_xai_bodies(message):
    """Each captured xAI 403 message must be recognized as an entitlement failure."""
    from run_agent import AIAgent

    error_context = {"message": message, "reason": "permission_denied"}
    assert AIAgent._is_entitlement_failure(error_context, 403)
|
|
|
|
|
|
def test_is_entitlement_failure_false_for_status_other_than_401_403():
    """500/429/200 are never entitlement failures, even with a matching body."""
    from run_agent import AIAgent

    matching_body = {"message": "do not have an active Grok subscription"}

    for status in (500, 429, 200):
        assert not AIAgent._is_entitlement_failure(matching_body, status)
|
|
|
|
|
|
def test_is_entitlement_failure_false_for_unrelated_auth_errors():
    """Ordinary auth failures (bad key, expired token) are not entitlement.

    Also covers the degenerate inputs: an empty context and ``None``.
    """
    from run_agent import AIAgent

    non_entitlement_contexts = [
        # Generic Anthropic-style auth failure
        {"message": "Invalid API key", "reason": "authentication_error"},
        # OAuth token expired
        {"message": "Token has expired", "reason": "unauthorized"},
        # Empty context
        {},
        None,
    ]

    for context in non_entitlement_contexts:
        assert not AIAgent._is_entitlement_failure(context, 401)
|
|
|
|
|
|
def test_recover_with_credential_pool_skips_refresh_on_entitlement_403():
    """Entitlement 403 must not reach ``pool.try_refresh_current()``.

    Before the fix, an unsubscribed xAI OAuth account kept the agent loop
    spinning: refresh → 403 → refresh → 403.  With the entitlement guard in
    place, recovery reports ``False`` so the error surfaces to the caller
    instead of being retried.
    """
    from run_agent import AIAgent
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()

    # Every refresh attempt on the fake pool leaves an entry here.
    refresh_log = []

    class _FakePool:
        def try_refresh_current(self):
            refresh_log.append("refresh")
            return MagicMock(id="should_not_be_called")

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()

    entitlement_context = {
        "reason": "The caller does not have permission to execute the specified operation",
        "message": "You have either run out of available resources or do not have an "
                   "active Grok subscription. Manage at https://grok.com",
    }

    recovered, _ = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=entitlement_context,
    )

    assert recovered is False, "Entitlement 403 must surface, not silently recover"
    assert len(refresh_log) == 0, "try_refresh_current must NOT be called on entitlement 403"
|
|
|
|
|
|
def test_recover_with_credential_pool_skips_refresh_on_bare_403_for_xai_oauth():
    """A keyword-free HTTP 403 from ``xai-oauth`` must not start a refresh loop.

    Regression for #26847: xAI's backend has been seen answering standard
    SuperGrok subscribers with a terse 403 body that contains none of the
    entitlement keywords.  Without the defense-in-depth guard the recovery
    path would mint a fresh token, receive another 403, and spin.
    """
    from run_agent import AIAgent
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()
    assert agent.provider == "xai-oauth"

    # Every refresh attempt on the fake pool leaves an entry here.
    refresh_log = []

    class _FakePool:
        def try_refresh_current(self):
            refresh_log.append("refresh")
            return MagicMock(id="should_not_be_called")

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()

    bare_context = {"reason": "forbidden", "message": "Forbidden"}
    assert not AIAgent._is_entitlement_failure(bare_context, 403), (
        "Pre-condition: bare 'Forbidden' body must NOT match the keyword "
        "heuristic — otherwise this test isn't covering the defense-in-depth path."
    )

    recovered, _ = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=bare_context,
    )

    assert recovered is False, "Bare 403 on xai-oauth must surface, not refresh-loop"
    assert len(refresh_log) == 0, "try_refresh_current must NOT be called on xai-oauth 403"
|
|
|
|
|
|
def test_recover_with_credential_pool_still_refreshes_genuine_auth_failure():
    """Regression guard: legitimate auth errors must still trigger refresh.

    A 401 with an "Invalid API key" body is fed to the recovery path with a
    fake pool whose refresh always succeeds.  Recovery must report success,
    call ``try_refresh_current()`` exactly once, and hand the refreshed
    entry to ``_swap_credential``.
    """
    from run_agent import AIAgent
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()

    refresh_calls = {"n": 0}

    class _FakePool:
        def try_refresh_current(self):
            refresh_calls["n"] += 1
            # Return a fake refreshed entry — semantically "refresh worked"
            entry = MagicMock()
            entry.id = "entry_refreshed"
            return entry

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()
    # _swap_credential is called by the recovery path — stub it out
    agent._swap_credential = MagicMock()

    error_context = {
        "reason": "authentication_error",
        "message": "Invalid API key",
    }

    recovered, _retried_429 = agent._recover_with_credential_pool(
        status_code=401,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=error_context,
    )

    assert recovered is True, "Genuine auth failure must still recover via refresh"
    assert refresh_calls["n"] == 1
    # The stub was previously installed but never checked: a recovery that
    # refreshed without swapping the new credential in would have passed.
    agent._swap_credential.assert_called_once()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix D-bis: bad-credentials 403 must NOT be classified as entitlement (#29344)
|
|
#
|
|
# xAI returns the same permission-denied ``code`` text for two distinct
|
|
# conditions: unsubscribed account vs. stale OAuth access token. The
|
|
# ``error`` field's ``[WKE=unauthenticated:...]`` suffix (and the
|
|
# accompanying "OAuth2 access token could not be validated" phrasing) is
|
|
# xAI's authoritative disambiguator — when present, the body is an auth
|
|
# failure, not entitlement, and the credential-pool refresh path must
|
|
# run. Pre-fix, long-running TUI sessions stuck on a stale token
|
|
# surfaced as a non-retryable client error; the workaround was to exit
|
|
# and reopen the TUI so the startup-resolve path refreshed.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_is_entitlement_failure_false_for_bad_credentials_wke_suffix():
    """A 403 tagged ``[WKE=unauthenticated:bad-credentials]`` is auth, not entitlement.

    This is the verbatim body shape from the #29344 report: the ``code``
    text looks like the entitlement permission-denied wording, while the
    ``error`` field carries xAI's explicit credential-validation signal.
    The classifier has to honor the latter.
    """
    from run_agent import AIAgent

    raw_body = {
        "code": "The caller does not have permission to execute the specified operation",
        "error": "The OAuth2 access token could not be validated. [WKE=unauthenticated:bad-credentials]",
    }

    assert not AIAgent._is_entitlement_failure(raw_body, 403)
|
|
|
|
|
|
def test_is_entitlement_failure_false_for_wke_suffix_in_normalized_shape():
    """Same bad-credentials body, in the ``{reason, message}`` normalised shape.

    Runtime paths reach the classifier through
    ``_extract_api_error_context``, which rewrites the raw body to
    ``{message, reason, reset_at}``.  The disambiguator has to fire for this
    shape as well as the raw one, otherwise the fix would not reach the
    production call site in ``_recover_with_credential_pool``.
    """
    from run_agent import AIAgent

    normalised_body = {
        "reason": "The caller does not have permission to execute the specified operation",
        "message": "The OAuth2 access token could not be validated. [WKE=unauthenticated:bad-credentials]",
    }

    assert not AIAgent._is_entitlement_failure(normalised_body, 403)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "wke_variant",
    [
        # The headline variant — what xAI returns today.
        "[WKE=unauthenticated:bad-credentials]",
        # Forward-compat: xAI documents the WKE prefix as a stable shape,
        # the suffix after the colon is the "reason code" and could grow
        # new values. Anything under ``unauthenticated:`` must route to
        # the refresh path.
        "[WKE=unauthenticated:expired-token]",
        "[WKE=unauthenticated:revoked]",
        "[WKE=unauthenticated:some-future-reason]",
    ],
)
def test_is_entitlement_failure_false_for_any_wke_unauthenticated_variant(wke_variant):
    """Any ``[WKE=unauthenticated:...]`` suffix must classify as not-entitlement."""
    from run_agent import AIAgent

    raw_body = {
        "code": "The caller does not have permission to execute the specified operation",
        "error": f"Token rejected. {wke_variant}",
    }

    assert not AIAgent._is_entitlement_failure(raw_body, 403)
|
|
|
|
|
|
def test_is_entitlement_failure_false_via_oauth2_validation_phrase_alone():
    """The OAuth2-validation phrase alone (no WKE suffix) routes to refresh.

    Belt-and-braces guard in case a future xAI API revision drops or
    reformats the WKE suffix while keeping the human-readable
    "OAuth2 access token could not be validated" text.
    """
    from run_agent import AIAgent

    raw_body = {
        "code": "The caller does not have permission to execute the specified operation",
        "error": "The OAuth2 access token could not be validated.",
    }

    assert not AIAgent._is_entitlement_failure(raw_body, 403)
|
|
|
|
|
|
def test_is_entitlement_failure_wke_signal_overrides_entitlement_keywords():
    """When a body has both the WKE suffix and entitlement wording, WKE wins.

    Defensive case: auth is recoverable and entitlement is not.  If the
    refreshed token still cannot access the resource, the next 403 (without
    the WKE suffix) is expected to land on the entitlement path.
    """
    from run_agent import AIAgent

    mixed_error_text = (
        "do not have an active Grok subscription. "
        "[WKE=unauthenticated:bad-credentials]"
    )
    raw_body = {
        "code": "The caller does not have permission to execute the specified operation",
        "error": mixed_error_text,
    }

    assert not AIAgent._is_entitlement_failure(raw_body, 403)
|
|
|
|
|
|
def test_is_entitlement_failure_case_insensitive_wke_match():
    """The WKE marker must be recognized regardless of letter case.

    Pins the behavior against a future xAI build that changes the casing of
    the prefix, which would otherwise reintroduce the misclassification.
    """
    from run_agent import AIAgent

    raw_body = {
        "code": "The caller does not have permission to execute the specified operation",
        "error": "[wke=Unauthenticated:Bad-Credentials]",
    }

    assert not AIAgent._is_entitlement_failure(raw_body, 403)
|
|
|
|
|
|
def test_recover_with_credential_pool_refreshes_on_xai_bad_credentials_403():
    """End-to-end #29344: a bad-credentials 403 must trigger a token refresh.

    A 403 from xai-oauth carrying the stale-token body MUST call
    ``try_refresh_current()`` so a long-running TUI session recovers without
    an exit/reopen cycle.

    Mirrors the scaffolding of
    ``test_recover_with_credential_pool_still_refreshes_genuine_auth_failure``
    but uses the 403 body shape that, before the fix, tripped the entitlement
    classifier and short-circuited the refresh path.
    """
    # NOTE(review): AIAgent is not referenced below; the import is kept
    # exactly as the test had it in case its side effects matter. Confirm.
    from run_agent import AIAgent
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()

    # Mutable counter so the nested pool class can record refresh attempts.
    refresh_counter = {"n": 0}

    class _RefreshingPool:
        """Pool stub whose refresh always succeeds and is counted."""

        def try_refresh_current(self):
            refresh_counter["n"] += 1
            refreshed = MagicMock()
            refreshed.id = "entry_refreshed_after_stale"
            return refreshed

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _RefreshingPool()
    agent._swap_credential = MagicMock()

    # Normalised shape that ``_extract_api_error_context`` would produce
    # for the reporter's wire-level body.
    reason_text = (
        "The caller does not have permission to execute the specified operation"
    )
    message_text = (
        "The OAuth2 access token could not be validated. "
        "[WKE=unauthenticated:bad-credentials]"
    )
    error_context = {"reason": reason_text, "message": message_text}

    outcome = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=error_context,
    )
    recovered, _retried_429 = outcome

    assert recovered is True, (
        "Stale OAuth token (bad-credentials 403) must trigger refresh — "
        "pre-fix this returned False because the entitlement classifier "
        "over-matched on the permission-denied code text"
    )
    assert refresh_counter["n"] == 1, "try_refresh_current must run exactly once"
    agent._swap_credential.assert_called_once()
|
|
|
|
|
|
def test_recover_with_credential_pool_still_blocks_real_entitlement():
    """Companion guard for #29344: a genuine entitlement 403 must NOT refresh.

    The original #26847 protection has to survive the new disambiguator. The
    body used here has no WKE suffix and no OAuth2-validation phrase, so
    recovery must report failure and the pool's refresh hook must never be
    called.
    """
    # NOTE(review): AIAgent is not referenced below; the import is kept
    # exactly as the test had it in case its side effects matter. Confirm.
    from run_agent import AIAgent
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()

    # Mutable counter so the nested pool class can record refresh attempts.
    refresh_counter = {"n": 0}

    class _CountingPool:
        """Pool stub that records any (unexpected) refresh attempt."""

        def try_refresh_current(self):
            refresh_counter["n"] += 1
            return MagicMock(id="should_not_be_called")

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _CountingPool()

    # Pure entitlement body — no WKE suffix, no OAuth2 phrase.
    reason_text = (
        "The caller does not have permission to execute the specified operation"
    )
    message_text = (
        "You have either run out of available resources or do not have an "
        "active Grok subscription. Manage at https://grok.com"
    )
    error_context = {"reason": reason_text, "message": message_text}

    outcome = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=error_context,
    )
    recovered, _retried_429 = outcome

    assert recovered is False, "Entitlement 403 must surface, not refresh"
    assert refresh_counter["n"] == 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix E: grok-4.3 context length must be 1M, not 256K
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_grok_4_3_context_length_is_1m():
    """grok-4.3 must resolve to a 1M-token context window.

    Per docs.x.ai/developers/models/grok-4.3 the model ships with 1M
    context. The substring-match fallback used to return 256k (via the
    "grok-4" catch-all), which under-reported the model's real capacity.
    """
    from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS

    one_million = 1_000_000

    # The dedicated entry exists with the expected value.
    assert DEFAULT_CONTEXT_LENGTHS["grok-4.3"] == one_million

    # Longest-first substring matching must pick "grok-4.3" for both the
    # bare slug and its "-latest" alias, NOT the shorter "grok-4" catch-all.
    for slug in ("grok-4.3", "grok-4.3-latest"):
        lowered = slug.lower()
        candidates = [key for key in DEFAULT_CONTEXT_LENGTHS if key in lowered]
        matched_key = max(candidates, key=len)
        assert matched_key == "grok-4.3", (
            f"Expected longest-first match to land on grok-4.3 for {slug}, "
            f"got {matched_key}"
        )
        assert DEFAULT_CONTEXT_LENGTHS[matched_key] == one_million
|
|
|
|
|
|
def test_grok_4_still_resolves_to_256k():
    """Regression guard: grok-4 (non-.3) slugs must still resolve to 256k."""
    from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS

    for slug in ("grok-4", "grok-4-0709"):
        lowered = slug.lower()
        candidates = [key for key in DEFAULT_CONTEXT_LENGTHS if key in lowered]
        matched_key = max(candidates, key=len)
        # "grok-4-0709" contains "grok-4" but not "grok-4.3", so the longest
        # matching key is "grok-4" (or a more specific variant family if one
        # is ever added). Only the 256k contract is asserted, not the key.
        assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 256_000
|