hermes-agent/tests/run_agent/test_codex_xai_oauth_recovery.py
kshitijk4poor 66827f8947 chore: prune unused imports and duplicate import redefinitions
Remove unused imports (F401) and duplicate/shadowed import
redefinitions (F811) across the codebase using ruff's safe
autofixes. No behavioral changes -- imports only.

- ~1400 safe autofixes applied across 644 files (net -1072 lines)
- __init__.py re-exports preserved (excluded from F401 removal so
  public re-export surfaces stay intact)
- Re-exports that are imported or monkeypatched by tests but look
  unused in their defining module are kept with explicit # noqa:
  F401 (gateway/run.py load_dotenv; run_agent re-exports from
  agent.message_sanitization, agent.context_compressor,
  agent.retry_utils, agent.prompt_builder, agent.process_bootstrap,
  agent.codex_responses_adapter)
- Unsafe F841 (unused-variable) fixes deliberately skipped -- those
  can change behavior when the RHS has side effects
- ruff lints remain disabled in pyproject.toml (only PLW1514 is
  selected); this is a one-time cleanup, not a config change

Verification:
- python -m compileall: clean
- pytest --collect-only: all 27161 tests collect (zero import errors)
- core entry points import clean (run_agent, model_tools, cli,
  toolsets, hermes_state, batch_runner, gateway)
- static scan: every name any test imports directly from an edited
  module still resolves
2026-05-28 22:26:25 -07:00

1080 lines
41 KiB
Python

"""Regression tests for the May 2026 xAI OAuth (SuperGrok / X Premium) bugs.
Three distinct failure modes the user community hit during rollout:
1. ``RuntimeError("Expected to have received `response.created` before
`error`")`` on multi-turn xAI OAuth conversations. The OpenAI SDK's
Responses streaming state machine collapses an upstream ``error`` SSE
frame into a generic stream-ordering error. ``_run_codex_stream``
first handled this the same way it handled the missing
``response.completed`` postlude — by falling back to a second
``responses.create(stream=True)`` call that surfaced the real provider
error. It now iterates the raw wire events and raises
``_StreamErrorEvent`` with the provider's message directly (see Fix A
below). Also closes #8133 (``response.in_progress`` prelude on custom
relays) and #14634 (``codex.rate_limits`` prelude on codex-lb).
2. The HTTP 403 entitlement error xAI returns when an OAuth token lacks
SuperGrok / X Premium ("You have either run out of available
resources or do not have an active Grok subscription") used to read
as a confusing wall of JSON. ``_summarize_api_error`` originally
appended a one-line hint pointing the user at https://grok.com and
``/model``; that hint was later dropped, and xAI's own body text is now
surfaced verbatim (see Fix B below).
3. Multi-turn replay of ``codex_reasoning_items`` (with
``encrypted_content``) was briefly suppressed for ``is_xai_responses``
in PR #26644 on the theory that xAI's OAuth/SuperGrok surface
rejected replayed encrypted reasoning items. That suppression was
reverted shortly after: xAI confirmed they explicitly want Hermes to
thread encrypted reasoning back across turns, and the original
multi-turn failure mode was actually the prelude-SSE issue closed by
Fix A above. The remaining tests here lock in that xAI receives
replayed reasoning AND that we ask xAI to echo it back in the
``include`` array.
"""
from types import SimpleNamespace
from unittest.mock import MagicMock
import pytest
# ---------------------------------------------------------------------------
# Fix A: prelude error surfacing via wire `error` events
#
# With the migration to ``responses.create(stream=True)`` raw event iteration,
# the SDK's high-level state-machine RuntimeError no longer mediates between
# the wire and us — we read the wire directly. When the chatgpt.com Codex
# backend (or xAI, codex-lb, custom relays) emits a ``type=error`` frame as
# its first event, our consumer raises ``_StreamErrorEvent`` straight from
# the wire payload, which carries the real provider message in ``.body`` /
# ``.message`` shape for ``_summarize_api_error`` to consume. This is
# strictly better than the old "SDK raises RuntimeError → we retry → fall
# back to a second non-stream call" two-phase dance, because the error
# surfaces on the first event instead of after one wasted round trip.
# ---------------------------------------------------------------------------
def _make_codex_agent():
    """Build a minimal AIAgent wired for codex_responses streaming tests.

    The agent is constructed against the xAI base URL with quiet mode on and
    context-file / memory loading skipped, then switched to the
    ``codex_responses`` API mode with the ``xai-oauth`` provider.

    Returns:
        The configured ``AIAgent`` instance.
    """
    # Function-scope import, matching the other tests in this module.
    from run_agent import AIAgent

    agent = AIAgent(
        api_key="test-key",
        base_url="https://api.x.ai/v1",
        model="grok-4.3",
        quiet_mode=True,
        skip_context_files=True,
        skip_memory=True,
    )
    # Override mode/provider after construction so the tests exercise the
    # Responses-API code path with the xai-oauth provider label.
    agent.api_mode = "codex_responses"
    agent.provider = "xai-oauth"
    agent._interrupt_requested = False
    return agent
@pytest.mark.parametrize(
    "provider_message",
    [
        "You do not have an active Grok subscription",
        "rate limit exceeded",
        "model not available",
    ],
)
def test_codex_stream_wire_error_event_surfaces_stream_error_event(provider_message):
    """A wire ``type=error`` SSE frame raises ``_StreamErrorEvent`` with the
    provider's real message in the body."""
    from run_agent import _StreamErrorEvent

    agent = _make_codex_agent()

    class _ErrorCreateStream:
        """Stand-in stream whose first and only event is a ``type=error`` frame."""

        def __iter__(self):
            yield SimpleNamespace(type="error", message=provider_message, code="forbidden")

        def close(self):
            pass

    mock_client = MagicMock()
    mock_client.responses.create.return_value = _ErrorCreateStream()

    with pytest.raises(_StreamErrorEvent) as excinfo:
        agent._run_codex_stream({}, client=mock_client)

    # The provider text must be visible both in str(exc) and in the
    # structured body.
    assert provider_message in str(excinfo.value)
    assert excinfo.value.body["error"]["message"] == provider_message
def test_codex_stream_retries_remote_protocol_error_once():
    """Transport errors (``httpx.RemoteProtocolError``) trigger a single retry.

    Previously this was on the ``responses.stream(...)`` helper; now it's on
    ``responses.create(stream=True)`` itself. The user-facing behavior is the
    same: one retry, then re-raise if the second attempt also fails.
    """
    import httpx

    agent = _make_codex_agent()
    call_count = {"n": 0}

    def create_side_effect(**kwargs):
        # Every attempt fails, so the counter equals the number of attempts.
        call_count["n"] += 1
        raise httpx.RemoteProtocolError(
            "peer closed connection without sending complete message body"
        )

    mock_client = MagicMock()
    mock_client.responses.create.side_effect = create_side_effect

    with pytest.raises(httpx.RemoteProtocolError):
        agent._run_codex_stream({}, client=mock_client)

    # max_stream_retries=1 → one retry + final attempt → 2 create calls total.
    assert call_count["n"] == 2
def test_codex_stream_unrelated_runtimeerror_still_raises():
    """RuntimeErrors that aren't transport errors must propagate.

    With the event-driven path there's no separate fallback function to
    short-circuit into; any RuntimeError from ``responses.create()`` or the
    consumer surfaces directly.
    """
    agent = _make_codex_agent()

    mock_client = MagicMock()
    mock_client.responses.create.side_effect = RuntimeError("something else broke")

    with pytest.raises(RuntimeError, match="something else broke"):
        agent._run_codex_stream({}, client=mock_client)
def test_codex_stream_truncated_no_terminal_event_raises():
    """Streams that end without a terminal event AND no items raise.

    Preserves the "Codex Responses stream did not emit a terminal response"
    signal callers use to distinguish "stream truncated mid-flight" from
    "stream completed with empty body". Previously surfaced by the SDK's
    ``RuntimeError("Didn't receive a `response.completed` event.")``; now
    surfaced directly by the event consumer.
    """
    agent = _make_codex_agent()

    class _EmptyStream:
        """Stand-in stream that yields no events at all."""

        def __iter__(self):
            return iter(())

        def close(self):
            pass

    mock_client = MagicMock()
    mock_client.responses.create.return_value = _EmptyStream()

    with pytest.raises(RuntimeError, match="did not emit a terminal response"):
        agent._run_codex_stream({}, client=mock_client)
# ---------------------------------------------------------------------------
# Fix B: surface xAI's entitlement body verbatim (no editorializing)
#
# The original PR #26644 appended a hint that led with "X Premium+ does NOT
# include xAI API access — only standalone SuperGrok subscribers can use this
# provider." xAI announced on 2026-05-16 that X Premium subs now work in
# Hermes (https://x.ai/news/grok-hermes), making that hint actively wrong:
# a Premium+ user hitting a real entitlement issue (no Grok sub, wrong tier,
# exhausted quota) would be misdirected to switch subscriptions when their
# Premium sub is in fact valid. We now surface xAI's own body text verbatim
# (which already says "Manage subscriptions at https://grok.com/?_s=usage")
# and leave the diagnosis to xAI's wording.
# ---------------------------------------------------------------------------
def test_summarize_api_error_surfaces_xai_entitlement_body_verbatim():
    """xAI's OAuth 403 body must surface as-is, with no Hermes-side hint."""
    from run_agent import AIAgent

    error = RuntimeError(
        "HTTP 403: Error code: 403 - {'code': 'The caller does not have permission "
        "to execute the specified operation', 'error': 'You have either run out of "
        "available resources or do not have an active Grok subscription. Manage "
        "subscriptions at https://grok.com'}"
    )
    summary = AIAgent._summarize_api_error(error)

    # xAI's own body text must reach the user — they need it to diagnose.
    assert "do not have an active Grok subscription" in summary
    # No stale claim that X Premium is incompatible with Hermes.
    assert "X Premium+ does NOT include" not in summary
    assert "standalone SuperGrok subscribers" not in summary
def test_summarize_api_error_xai_body_message_unwrapped():
    """SDK-style error with structured body surfaces the message cleanly."""
    from run_agent import AIAgent

    class _XaiErr(Exception):
        """Exception carrying ``status_code`` and a nested ``body`` dict."""

        status_code = 403
        body = {
            "error": {
                "message": (
                    "You have either run out of available resources or do "
                    "not have an active Grok subscription. Manage at "
                    "https://grok.com"
                )
            }
        }

    summary = AIAgent._summarize_api_error(_XaiErr("403"))

    assert "HTTP 403" in summary
    assert "do not have an active Grok subscription" in summary
    # No editorializing on top of xAI's own wording.
    assert "X Premium+ does NOT include" not in summary
def test_summarize_api_error_passes_through_unrelated_errors():
    """Non-xAI / non-entitlement errors must not be touched."""
    from run_agent import AIAgent

    error = RuntimeError("HTTP 500: upstream is sad")
    summary = AIAgent._summarize_api_error(error)

    # Nothing Grok-specific may be injected into an unrelated error...
    assert "SuperGrok" not in summary
    assert "grok.com" not in summary
    # ...and the original text must still be there.
    assert "upstream is sad" in summary
# ---------------------------------------------------------------------------
# Fix D: _StreamErrorEvent xAI entitlement classified as auth, not retryable
#
# run_codex_create_stream_fallback raises _StreamErrorEvent (status_code=None)
# when the Responses stream emits a ``type=error`` SSE frame. Before this
# fix, classify_api_error had no match for "grok subscription" in its pattern
# lists, so it returned FailoverReason.unknown (retryable=True) — burning
# max_retries before the agent stopped. _is_entitlement_failure was never
# called because it only runs when FailoverReason.auth is returned.
# ---------------------------------------------------------------------------
def test_classify_api_error_stream_event_grok_subscription_is_auth():
    """_StreamErrorEvent with xAI subscription message classifies as auth/non-retryable.

    The SSE error path has status_code=None, so _classify_by_status is
    skipped. The explicit pattern added at step 1 must fire first and
    return auth/non-retryable so _is_entitlement_failure can stop the loop.
    """
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    err = _StreamErrorEvent(
        "You have either run out of available resources or do not have an "
        "active Grok subscription. Manage subscriptions at https://grok.com",
        code="The caller does not have permission to execute the specified operation",
    )
    result = classify_api_error(err, provider="xai-oauth", model="grok-4.3")

    assert result.reason == FailoverReason.auth
    assert result.retryable is False
    assert result.should_fallback is True
def test_classify_api_error_stream_event_resources_exhausted_grok_is_auth():
    """'out of available resources' + 'grok' variant also classifies as auth."""
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    err = _StreamErrorEvent(
        "You have run out of available resources for Grok.",
    )
    result = classify_api_error(err, provider="xai-oauth", model="grok-4.3")

    assert result.reason == FailoverReason.auth
    assert result.retryable is False
def test_classify_api_error_stream_event_unrelated_not_reclassified():
    """An unrelated _StreamErrorEvent must not be caught by the xAI guard."""
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    err = _StreamErrorEvent("Internal server error — try again later")
    result = classify_api_error(err, provider="xai-oauth", model="grok-4.3")

    # Only the negative is pinned: whatever the reason is, it is not auth.
    assert result.reason != FailoverReason.auth
# ---------------------------------------------------------------------------
# Fix C: reasoning replay gating for xai-oauth
# ---------------------------------------------------------------------------
def _assistant_msg_with_encrypted_reasoning(text="hi from grok", encrypted="enc_blob"):
return {
"role": "assistant",
"content": text,
"codex_reasoning_items": [
{
"type": "reasoning",
"id": "rs_xai_001",
"encrypted_content": encrypted,
"summary": [],
}
],
}
def test_codex_reasoning_replay_default_includes_encrypted_content():
    """Native Codex backend (default) must still replay encrypted reasoning."""
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    msgs = [
        {"role": "user", "content": "hi"},
        _assistant_msg_with_encrypted_reasoning(),
        {"role": "user", "content": "what's your name?"},
    ]
    items = _chat_messages_to_responses_input(msgs)

    # Exactly one reasoning item, with its encrypted payload intact.
    reasoning = [it for it in items if it.get("type") == "reasoning"]
    assert len(reasoning) == 1
    assert reasoning[0]["encrypted_content"] == "enc_blob"
def test_codex_reasoning_replay_includes_encrypted_content_for_xai():
    """xAI must receive replayed encrypted reasoning items (May 2026 reversal).

    Earlier we stripped these on the theory that the OAuth/SuperGrok
    surface rejected them. xAI subsequently confirmed they explicitly
    want Hermes to thread encrypted reasoning back across turns for
    cross-turn coherence — that's the whole point of the partnership
    integration.
    """
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    msgs = [
        {"role": "user", "content": "hi"},
        _assistant_msg_with_encrypted_reasoning(),
        {"role": "user", "content": "what's your name?"},
    ]
    items = _chat_messages_to_responses_input(msgs, is_xai_responses=True)

    reasoning = [it for it in items if it.get("type") == "reasoning"]
    assert len(reasoning) == 1, (
        "xAI must receive replayed reasoning items — see docstring for the "
        "May 2026 reversal of the earlier suppression gate."
    )
    assert reasoning[0]["encrypted_content"] == "enc_blob"

    # And the assistant's visible text must still be present alongside it.
    assistant_items = [
        it for it in items
        if it.get("role") == "assistant" or it.get("type") == "message"
    ]
    assert assistant_items, "assistant message must still be present"
def test_codex_transport_xai_request_includes_encrypted_content():
    """xAI ``include`` array must request ``reasoning.encrypted_content``.

    This is the request-side half of the May 2026 reversal: we ask xAI
    to echo back encrypted reasoning so the next turn can replay it.
    """
    from agent.transports.codex import ResponsesApiTransport

    transport = ResponsesApiTransport()
    kwargs = transport.build_kwargs(
        model="grok-4.3",
        messages=[
            {"role": "system", "content": "you are a helpful assistant"},
            {"role": "user", "content": "hi"},
        ],
        tools=None,
        instructions="you are a helpful assistant",
        reasoning_config={"enabled": True, "effort": "medium"},
        is_xai_responses=True,
    )

    # Exact equality: the include list holds this one entry and nothing else.
    assert kwargs["include"] == ["reasoning.encrypted_content"]
def test_codex_transport_xai_replays_reasoning_in_input():
    """End-to-end: build_kwargs on xAI must replay prior encrypted reasoning."""
    from agent.transports.codex import ResponsesApiTransport

    transport = ResponsesApiTransport()
    kwargs = transport.build_kwargs(
        model="grok-4.3",
        messages=[
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "hi"},
            _assistant_msg_with_encrypted_reasoning(text="hi from grok"),
            {"role": "user", "content": "what's your name?"},
        ],
        tools=None,
        instructions="sys",
        reasoning_config={"enabled": True, "effort": "medium"},
        is_xai_responses=True,
    )

    input_items = kwargs["input"]
    reasoning_items = [it for it in input_items if it.get("type") == "reasoning"]
    assert len(reasoning_items) == 1
    assert reasoning_items[0]["encrypted_content"] == "enc_blob"
def test_codex_transport_native_codex_still_replays_reasoning_in_input():
    """Regression guard: openai-codex must keep the existing replay path."""
    from agent.transports.codex import ResponsesApiTransport

    transport = ResponsesApiTransport()
    kwargs = transport.build_kwargs(
        model="gpt-5-codex",
        messages=[
            {"role": "system", "content": "sys"},
            {"role": "user", "content": "hi"},
            _assistant_msg_with_encrypted_reasoning(text="hi from codex"),
            {"role": "user", "content": "next"},
        ],
        tools=None,
        instructions="sys",
        reasoning_config={"enabled": True, "effort": "medium"},
        is_xai_responses=False,
    )

    input_items = kwargs["input"]
    reasoning_items = [it for it in input_items if it.get("type") == "reasoning"]
    assert len(reasoning_items) == 1
    assert reasoning_items[0]["encrypted_content"] == "enc_blob"
    # Native Codex still asks for encrypted_content back.
    assert "reasoning.encrypted_content" in kwargs.get("include", [])
# ---------------------------------------------------------------------------
# Fix D: entitlement 403 must NOT trigger credential-pool refresh loop
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
    "message",
    [
        # The exact wire text RaidenTyler and Don Piedro captured.
        # Parenthesised so the two-literal concatenation is visibly ONE item.
        (
            "You have either run out of available resources or do not have an "
            "active Grok subscription. Manage at https://grok.com"
        ),
        # Permission-style variant from the same 403 body.
        (
            "The caller does not have permission to execute the specified "
            "operation for grok-4.3"
        ),
    ],
)
def test_is_entitlement_failure_matches_real_xai_bodies(message):
    """Each parametrized xAI 403 message is classified as an entitlement failure."""
    from run_agent import AIAgent

    assert AIAgent._is_entitlement_failure(
        {"message": message, "reason": "permission_denied"},
        403,
    )
def test_is_entitlement_failure_false_for_status_other_than_401_403():
    """200/429/500 must never be classified as entitlement, even if body matches."""
    from run_agent import AIAgent

    # Body text that does match at 403; only the status code differs here.
    body = {
        "message": "do not have an active Grok subscription",
    }

    assert not AIAgent._is_entitlement_failure(body, 500)
    assert not AIAgent._is_entitlement_failure(body, 429)
    assert not AIAgent._is_entitlement_failure(body, 200)
def test_is_entitlement_failure_false_for_unrelated_auth_errors():
    """A real auth failure (expired token, wrong key) must keep refreshing."""
    from run_agent import AIAgent

    # Generic Anthropic-style auth failure
    assert not AIAgent._is_entitlement_failure(
        {"message": "Invalid API key", "reason": "authentication_error"},
        401,
    )
    # OAuth token expired
    assert not AIAgent._is_entitlement_failure(
        {"message": "Token has expired", "reason": "unauthorized"},
        401,
    )
    # Empty context
    assert not AIAgent._is_entitlement_failure({}, 401)
    assert not AIAgent._is_entitlement_failure(None, 401)
def test_recover_with_credential_pool_skips_refresh_on_entitlement_403():
    """The recovery path must NOT call pool.try_refresh_current() on entitlement 403.

    Before the fix, an unsubscribed xAI OAuth account would burn the agent
    loop indefinitely: refresh → 403 → refresh → 403, infinitely. With
    the entitlement guard, recovery returns False so the error surfaces
    normally instead of being retried.
    """
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()

    # Wire a fake credential pool that records refresh attempts.
    refresh_calls = {"n": 0}

    class _FakePool:
        def try_refresh_current(self):
            refresh_calls["n"] += 1
            return MagicMock(id="should_not_be_called")

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()

    error_context = {
        "reason": "The caller does not have permission to execute the specified operation",
        "message": (
            "You have either run out of available resources or do not have an "
            "active Grok subscription. Manage at https://grok.com"
        ),
    }
    recovered, _retried_429 = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=error_context,
    )

    assert recovered is False, "Entitlement 403 must surface, not silently recover"
    assert refresh_calls["n"] == 0, "try_refresh_current must NOT be called on entitlement 403"
def test_recover_with_credential_pool_skips_refresh_on_bare_403_for_xai_oauth():
    """A bare HTTP 403 from ``xai-oauth`` (no keyword match) must NOT loop refresh.

    Regression for #26847 — xAI's backend has been seen to 403 standard
    SuperGrok subscribers with a terser body that doesn't contain any of
    the existing entitlement keywords ("do not have an active Grok
    subscription", etc.). Before the defense-in-depth guard, the recovery
    path would happily mint a fresh token, get a fresh 403, and spin.
    """
    from run_agent import AIAgent
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()
    assert agent.provider == "xai-oauth"

    refresh_calls = {"n": 0}

    class _FakePool:
        def try_refresh_current(self):
            refresh_calls["n"] += 1
            return MagicMock(id="should_not_be_called")

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()

    error_context = {
        "reason": "forbidden",
        "message": "Forbidden",
    }
    assert not AIAgent._is_entitlement_failure(error_context, 403), (
        "Pre-condition: bare 'Forbidden' body must NOT match the keyword "
        "heuristic — otherwise this test isn't covering the defense-in-depth path."
    )

    recovered, _retried_429 = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=error_context,
    )

    assert recovered is False, "Bare 403 on xai-oauth must surface, not refresh-loop"
    assert refresh_calls["n"] == 0, "try_refresh_current must NOT be called on xai-oauth 403"
def test_recover_with_credential_pool_still_refreshes_genuine_auth_failure():
    """Regression guard: legitimate auth errors must still trigger refresh."""
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()
    refresh_calls = {"n": 0}

    class _FakePool:
        def try_refresh_current(self):
            refresh_calls["n"] += 1
            # Return a fake refreshed entry — semantically "refresh worked"
            entry = MagicMock()
            entry.id = "entry_refreshed"
            return entry

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()
    # _swap_credential is called by the recovery path — stub it out
    agent._swap_credential = MagicMock()

    error_context = {
        "reason": "authentication_error",
        "message": "Invalid API key",
    }
    recovered, _retried_429 = agent._recover_with_credential_pool(
        status_code=401,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=error_context,
    )

    assert recovered is True, "Genuine auth failure must still recover via refresh"
    assert refresh_calls["n"] == 1
# ---------------------------------------------------------------------------
# Fix D-bis: bad-credentials 403 must NOT be classified as entitlement (#29344)
#
# xAI returns the same permission-denied ``code`` text for two distinct
# conditions: unsubscribed account vs. stale OAuth access token. The
# ``error`` field's ``[WKE=unauthenticated:...]`` suffix (and the
# accompanying "OAuth2 access token could not be validated" phrasing) is
# xAI's authoritative disambiguator — when present, the body is an auth
# failure, not entitlement, and the credential-pool refresh path must
# run. Pre-fix, long-running TUI sessions stuck on a stale token
# surfaced as a non-retryable client error; the workaround was to exit
# and reopen the TUI so the startup-resolve path refreshed.
# ---------------------------------------------------------------------------
def test_is_entitlement_failure_false_for_bad_credentials_wke_suffix():
    """403 with ``[WKE=unauthenticated:bad-credentials]`` is auth, not entitlement.

    Verbatim shape from the #29344 reporter — the ``code`` text matches
    the entitlement permission-denied heuristic, but the ``error`` field
    carries xAI's explicit "this is a credential validation failure"
    signal. Classifier must honor it.
    """
    from run_agent import AIAgent

    assert not AIAgent._is_entitlement_failure(
        {
            "code": "The caller does not have permission to execute the specified operation",
            "error": "The OAuth2 access token could not be validated. [WKE=unauthenticated:bad-credentials]",
        },
        403,
    )
def test_is_entitlement_failure_false_for_wke_suffix_in_normalized_shape():
    """The same body after ``_extract_api_error_context`` normalisation.

    Real runtime paths feed the classifier through
    ``_extract_api_error_context``, which converts the raw body to
    ``{message, reason, reset_at}``. The disambiguator must fire in
    BOTH the raw-body shape (``code`` / ``error`` keys) and the normalised
    shape so the fix actually reaches the production call site at
    ``_recover_with_credential_pool``.
    """
    from run_agent import AIAgent

    assert not AIAgent._is_entitlement_failure(
        {
            "reason": "The caller does not have permission to execute the specified operation",
            "message": "The OAuth2 access token could not be validated. [WKE=unauthenticated:bad-credentials]",
        },
        403,
    )
@pytest.mark.parametrize(
    "wke_variant",
    [
        # The headline variant — what xAI returns today.
        "[WKE=unauthenticated:bad-credentials]",
        # Forward-compat: xAI documents the WKE prefix as a stable shape,
        # the suffix after the colon is the "reason code" and could grow
        # new values. Anything under ``unauthenticated:`` must route to
        # the refresh path.
        "[WKE=unauthenticated:expired-token]",
        "[WKE=unauthenticated:revoked]",
        "[WKE=unauthenticated:some-future-reason]",
    ],
)
def test_is_entitlement_failure_false_for_any_wke_unauthenticated_variant(wke_variant):
    """Any ``[WKE=unauthenticated:...]`` suffix keeps a 403 off the entitlement path."""
    from run_agent import AIAgent

    assert not AIAgent._is_entitlement_failure(
        {
            "code": "The caller does not have permission to execute the specified operation",
            "error": f"Token rejected. {wke_variant}",
        },
        403,
    )
def test_is_entitlement_failure_false_via_oauth2_validation_phrase_alone():
    """Second disambiguator: the OAuth2-validation phrase on its own.

    The "OAuth2 access token could not be validated" phrase by itself (no
    WKE suffix) must also route to refresh. This is a belt-and-braces guard
    against xAI dropping or reformatting the WKE suffix in a future API
    revision without changing the human-readable error text.
    """
    from run_agent import AIAgent

    assert not AIAgent._is_entitlement_failure(
        {
            "code": "The caller does not have permission to execute the specified operation",
            "error": "The OAuth2 access token could not be validated.",
        },
        403,
    )
def test_is_entitlement_failure_wke_signal_overrides_entitlement_keywords():
    """Defensive: the WKE signal wins over entitlement language.

    If a future xAI body somehow carries BOTH the WKE suffix AND
    entitlement language, the WKE signal wins. Auth is recoverable;
    entitlement isn't. If the refreshed token still can't access the
    resource, the next 403 (without WKE) lands on the entitlement path
    correctly.
    """
    from run_agent import AIAgent

    assert not AIAgent._is_entitlement_failure(
        {
            "code": "The caller does not have permission to execute the specified operation",
            "error": (
                "do not have an active Grok subscription. "
                "[WKE=unauthenticated:bad-credentials]"
            ),
        },
        403,
    )
def test_is_entitlement_failure_case_insensitive_wke_match():
    """Substring match is case-insensitive.

    The classifier lowercases everything before matching, so a future xAI
    build that uppercases the prefix wouldn't reintroduce the
    misclassification.
    """
    from run_agent import AIAgent

    assert not AIAgent._is_entitlement_failure(
        {
            "code": "The caller does not have permission to execute the specified operation",
            # Deliberately mixed case to exercise the case-insensitive match.
            "error": "[wke=Unauthenticated:Bad-Credentials]",
        },
        403,
    )
def test_recover_with_credential_pool_refreshes_on_xai_bad_credentials_403():
    """End-to-end #29344: a bad-credentials 403 from xai-oauth MUST refresh.

    The recovery path must call ``try_refresh_current()`` so the
    long-running TUI session recovers without an exit/reopen cycle.

    Mirrors the scaffolding of
    ``test_recover_with_credential_pool_still_refreshes_genuine_auth_failure``
    but with the exact 403 body shape xAI ships for stale tokens —
    the very body that pre-fix tripped the entitlement classifier
    and short-circuited the refresh path.
    """
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()
    refresh_calls = {"n": 0}

    class _FakePool:
        def try_refresh_current(self):
            refresh_calls["n"] += 1
            entry = MagicMock()
            entry.id = "entry_refreshed_after_stale"
            return entry

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()
    agent._swap_credential = MagicMock()

    # Normalised shape that ``_extract_api_error_context`` would
    # produce for the reporter's wire-level body.
    error_context = {
        "reason": (
            "The caller does not have permission to execute the specified operation"
        ),
        "message": (
            "The OAuth2 access token could not be validated. "
            "[WKE=unauthenticated:bad-credentials]"
        ),
    }
    recovered, _retried_429 = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=error_context,
    )

    assert recovered is True, (
        "Stale OAuth token (bad-credentials 403) must trigger refresh — "
        "pre-fix this returned False because the entitlement classifier "
        "over-matched on the permission-denied code text"
    )
    assert refresh_calls["n"] == 1, "try_refresh_current must run exactly once"
    agent._swap_credential.assert_called_once()
def test_recover_with_credential_pool_still_blocks_real_entitlement():
    """Companion regression guard for the #29344 fix.

    The original #26847 protection — entitlement 403 must NOT refresh —
    must survive the new disambiguator. A real unsubscribed-account body
    has no WKE suffix and no OAuth2-validation phrase, so the classifier
    still classifies it as entitlement and short-circuits.
    """
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()
    refresh_calls = {"n": 0}

    class _FakePool:
        def try_refresh_current(self):
            refresh_calls["n"] += 1
            return MagicMock(id="should_not_be_called")

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()

    # Pure entitlement body — no WKE suffix, no OAuth2 phrase.
    error_context = {
        "reason": (
            "The caller does not have permission to execute the specified operation"
        ),
        "message": (
            "You have either run out of available resources or do not have an "
            "active Grok subscription. Manage at https://grok.com"
        ),
    }
    recovered, _retried_429 = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=error_context,
    )

    assert recovered is False, "Entitlement 403 must surface, not refresh"
    assert refresh_calls["n"] == 0
# ---------------------------------------------------------------------------
# Fix E: grok-4.3 context length must be 1M, not 256K
# ---------------------------------------------------------------------------
def test_grok_4_3_context_length_is_1m():
    """grok-4.3 ships with 1M context per docs.x.ai/developers/models/grok-4.3.

    Hermes' substring-match fallback used to return 256k (from the
    "grok-4" catch-all) which under-reported the model's real capacity.
    """
    from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS

    # The entry exists with the expected value.
    assert DEFAULT_CONTEXT_LENGTHS["grok-4.3"] == 1_000_000

    # And longest-first substring matching resolves grok-4.3 and
    # grok-4.3-latest to the new value, NOT the grok-4 catch-all.
    for slug in ("grok-4.3", "grok-4.3-latest"):
        # Longest key that is a substring of the slug.
        matched_key = max(
            (k for k in DEFAULT_CONTEXT_LENGTHS if k in slug.lower()),
            key=len,
        )
        assert matched_key == "grok-4.3", (
            f"Expected longest-first match to land on grok-4.3 for {slug}, "
            f"got {matched_key}"
        )
        assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 1_000_000
def test_grok_4_still_resolves_to_256k():
    """Guard that plain grok-4 slugs keep resolving to a 256k context."""
    from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS

    for slug in ("grok-4", "grok-4-0709"):
        lowered_slug = slug.lower()
        candidate_keys = [
            key for key in DEFAULT_CONTEXT_LENGTHS if key in lowered_slug
        ]
        matched_key = max(candidate_keys, key=len)
        # "grok-4-0709" contains "grok-4" but not "grok-4.3". Only the
        # resolved value is asserted (not the key name) so that a more
        # specific grok-4 family key added later does not break the test
        # as long as the 256k contract still holds.
        assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 256_000
# ---------------------------------------------------------------------------
# Cross-issuer reasoning replay guard
#
# When a session switches model providers mid-conversation (e.g. user runs
# /model gpt-5.5 after several turns on grok-4.3), the persisted reasoning
# items carry encrypted_content that only the issuing endpoint can decrypt.
# Replaying them against the new endpoint deterministically returns HTTP 400
# invalid_encrypted_content and breaks every subsequent turn. The cross-issuer
# guard stamps each reasoning item with its issuer on normalize and drops
# foreign-issuer items on replay.
# ---------------------------------------------------------------------------
def _stamped_assistant_msg(
    issuer_kind, *, text="hi", encrypted="enc_blob", rs_id="rs_001"
):
    """Build an assistant chat message carrying one stamped reasoning item.

    Args:
        issuer_kind: Value stored under the item's ``_issuer_kind`` key.
        text: Assistant message content.
        encrypted: Value stored under ``encrypted_content``.
        rs_id: Value stored under the reasoning item's ``id``.

    Returns:
        A new dict with ``role``, ``content`` and a single-element
        ``codex_reasoning_items`` list; nothing is shared between calls.
    """
    reasoning_item = {
        "type": "reasoning",
        "id": rs_id,
        "encrypted_content": encrypted,
        "summary": [],
        "_issuer_kind": issuer_kind,
    }
    message = {"role": "assistant", "content": text}
    message["codex_reasoning_items"] = [reasoning_item]
    return message
def test_cross_issuer_reasoning_is_dropped_on_replay():
    """Reasoning stamped by one issuer must not be replayed to another.

    Regression for the chatgpt-backend vs xAI-OAuth swap: after a
    mid-session model change every turn came back with
    invalid_encrypted_content. Converting a history that contains an
    ``xai_responses``-stamped item for ``codex_backend`` must yield no
    reasoning items at all.
    """
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    history = [
        {"role": "user", "content": "hi"},
        _stamped_assistant_msg("xai_responses", encrypted="grok_blob"),
        {"role": "user", "content": "next"},
    ]

    # Target issuer differs from the stamp on the stored reasoning item.
    converted = _chat_messages_to_responses_input(
        history, current_issuer_kind="codex_backend"
    )

    surviving_reasoning = [
        item for item in converted if item.get("type") == "reasoning"
    ]
    assert surviving_reasoning == [], (
        "Reasoning items stamped with a foreign _issuer_kind must be dropped "
        "before the API rejects the whole request with invalid_encrypted_content."
    )
def test_same_issuer_reasoning_is_still_replayed():
    """Reasoning stamped by the current issuer must survive conversion.

    Same-endpoint replay is the documented happy path (May 2026
    reversal); the cross-issuer guard must leave it intact while still
    removing the internal ``_issuer_kind`` stamp from the emitted item.
    """
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    history = [
        {"role": "user", "content": "hi"},
        _stamped_assistant_msg("xai_responses", encrypted="grok_blob"),
        {"role": "user", "content": "next"},
    ]

    converted = _chat_messages_to_responses_input(
        history, current_issuer_kind="xai_responses"
    )

    kept_reasoning = [
        item for item in converted if item.get("type") == "reasoning"
    ]
    assert len(kept_reasoning) == 1
    replayed_item = kept_reasoning[0]
    assert replayed_item["encrypted_content"] == "grok_blob"
    # The stamp is bookkeeping only and must not reach the API payload.
    assert "_issuer_kind" not in replayed_item
def test_unstamped_reasoning_is_replayed_for_backwards_compat():
    """Reasoning items lacking ``_issuer_kind`` must still be replayed.

    Items persisted before the stamp existed carry no issuer marker; they
    are expected to pass through conversion unchanged (legacy-compatible
    behaviour) regardless of the current issuer.
    """
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    # Deliberately has no "_issuer_kind" key.
    legacy_reasoning = {
        "type": "reasoning",
        "id": "rs_legacy",
        "encrypted_content": "legacy_blob",
        "summary": [],
    }
    history = [
        {"role": "user", "content": "hi"},
        {
            "role": "assistant",
            "content": "hello",
            "codex_reasoning_items": [legacy_reasoning],
        },
        {"role": "user", "content": "next"},
    ]

    converted = _chat_messages_to_responses_input(
        history, current_issuer_kind="codex_backend"
    )

    kept_reasoning = [
        item for item in converted if item.get("type") == "reasoning"
    ]
    assert len(kept_reasoning) == 1
    assert kept_reasoning[0]["encrypted_content"] == "legacy_blob"
def test_normalize_codex_response_stamps_issuer_on_reasoning():
    """Normalizing a response must stamp its reasoning with the issuer.

    The stamp is what lets a later replay against a different endpoint
    recognise the item as foreign and drop it.
    """
    from types import SimpleNamespace

    from agent.codex_responses_adapter import _normalize_codex_response

    # Minimal stand-in for a Responses payload: one reasoning item
    # followed by one completed assistant message.
    output_items = [
        SimpleNamespace(
            type="reasoning",
            id="rs_new",
            encrypted_content="fresh_blob",
            summary=[],
        ),
        SimpleNamespace(
            type="message",
            role="assistant",
            status="completed",
            content=[SimpleNamespace(type="output_text", text="ok")],
            id="msg_1",
        ),
    ]
    response = SimpleNamespace(output=output_items, status="completed")

    normalized, _unused = _normalize_codex_response(
        response, issuer_kind="xai_responses"
    )

    stamped_items = normalized.codex_reasoning_items
    assert stamped_items and len(stamped_items) == 1
    assert stamped_items[0]["_issuer_kind"] == "xai_responses"
    assert stamped_items[0]["encrypted_content"] == "fresh_blob"
def test_transport_round_trip_drops_foreign_reasoning():
    """End-to-end check through ``ResponsesApiTransport.build_kwargs``.

    Building a codex_backend request from a history that holds an
    ``xai_responses``-stamped reasoning item must produce an ``input``
    array containing no reasoning items.
    """
    from agent.transports.codex import ResponsesApiTransport

    history = [
        {"role": "system", "content": "you are hermes"},
        {"role": "user", "content": "hi"},
        _stamped_assistant_msg("xai_responses", encrypted="grok_blob"),
        {"role": "user", "content": "엑스다임 프로젝트 파악, 스킬로 정리."},
    ]

    request_kwargs = ResponsesApiTransport().build_kwargs(
        model="gpt-5.5",
        messages=history,
        tools=None,
        is_codex_backend=True,
        is_xai_responses=False,
        is_github_responses=False,
        base_url="https://chatgpt.com/backend-api/codex",
        instructions="you are hermes",
    )

    leaked_reasoning = [
        item for item in request_kwargs["input"] if item.get("type") == "reasoning"
    ]
    assert leaked_reasoning == [], (
        "Cross-issuer reasoning leaked through build_kwargs — this is the "
        "exact regression that broke session 40de1ae0 on 2026-05-25 01:09."
    )