mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
Remove unused imports (F401) and duplicate/shadowed import redefinitions (F811) across the codebase using ruff's safe autofixes. No behavioral changes -- imports only. - ~1400 safe autofixes applied across 644 files (net -1072 lines) - __init__.py re-exports preserved (excluded from F401 removal so public re-export surfaces stay intact) - Re-exports that are imported or monkeypatched by tests but look unused in their defining module are kept with explicit # noqa: F401 (gateway/run.py load_dotenv; run_agent re-exports from agent.message_sanitization, agent.context_compressor, agent.retry_utils, agent.prompt_builder, agent.process_bootstrap, agent.codex_responses_adapter) - Unsafe F841 (unused-variable) fixes deliberately skipped -- those can change behavior when the RHS has side effects - ruff lints remain disabled in pyproject.toml (only PLW1514 is selected); this is a one-time cleanup, not a config change Verification: - python -m compileall: clean - pytest --collect-only: all 27161 tests collect (zero import errors) - core entry points import clean (run_agent, model_tools, cli, toolsets, hermes_state, batch_runner, gateway) - static scan: every name any test imports directly from an edited module still resolves
1080 lines
41 KiB
Python
1080 lines
41 KiB
Python
"""Regression tests for the May 2026 xAI OAuth (SuperGrok / X Premium) bugs.

Three distinct failure modes the user community hit during rollout:

1. ``RuntimeError("Expected to have received `response.created` before
   `error`")`` on multi-turn xAI OAuth conversations.  The OpenAI SDK's
   Responses streaming state machine collapses an upstream ``error`` SSE
   frame into a generic stream-ordering error.  ``_run_codex_stream``
   now treats this the same way it already treats the missing
   ``response.completed`` postlude — fall back to calling
   ``responses.create(stream=True)`` directly instead of going through the
   SDK's ``responses.stream(...)`` helper, which surfaces the real provider
   error.  Also closes #8133 (``response.in_progress`` prelude on custom
   relays) and #14634 (``codex.rate_limits`` prelude on codex-lb).
   (This fallback has since been superseded by direct wire-event
   iteration; see the "Fix A" section below.)

2. The HTTP 403 entitlement error xAI returns when an OAuth token lacks
   SuperGrok / X Premium ("You have either run out of available
   resources or do not have an active Grok subscription") used to read
   as a confusing wall of JSON.  ``_summarize_api_error`` now appends a
   one-line hint pointing the user at https://grok.com and ``/model``.
   (That hint was later removed and xAI's body is surfaced verbatim; see
   the "Fix B" section below.)

3. Multi-turn replay of ``codex_reasoning_items`` (with
   ``encrypted_content``) was briefly suppressed for ``is_xai_responses``
   in PR #26644 on the theory that xAI's OAuth/SuperGrok surface
   rejected replayed encrypted reasoning items.  That suppression was
   reverted shortly after: xAI confirmed they explicitly want Hermes to
   thread encrypted reasoning back across turns, and the original
   multi-turn failure mode was actually the prelude-SSE issue closed by
   Fix A above.  The remaining tests here lock in that xAI receives
   replayed reasoning AND that we ask xAI to echo it back in the
   ``include`` array.
"""
|
|
|
|
from types import SimpleNamespace
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix A: prelude error surfacing via wire `error` events
|
|
#
|
|
# With the migration to ``responses.create(stream=True)`` raw event iteration,
|
|
# the SDK's high-level state-machine RuntimeError no longer mediates between
|
|
# the wire and us — we read the wire directly. When the chatgpt.com Codex
|
|
# backend (or xAI, codex-lb, custom relays) emits a ``type=error`` frame as
|
|
# its first event, our consumer raises ``_StreamErrorEvent`` straight from
|
|
# the wire payload, which carries the real provider message in ``.body`` /
|
|
# ``.message`` shape for ``_summarize_api_error`` to consume. This is
|
|
# strictly better than the old "SDK raises RuntimeError → we retry → fall
|
|
# back to a second non-stream call" two-phase dance, because the error
|
|
# surfaces on the first event instead of after one wasted round trip.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _make_codex_agent():
    """Return a minimal ``AIAgent`` set up for codex_responses streaming tests.

    The agent is constructed against the xAI base URL with a dummy key and
    the quiet / skip flags enabled, then switched to the ``codex_responses``
    API mode and the ``xai-oauth`` provider with no interrupt pending.
    """
    from run_agent import AIAgent

    constructor_kwargs = {
        "api_key": "test-key",
        "base_url": "https://api.x.ai/v1",
        "model": "grok-4.3",
        "quiet_mode": True,
        "skip_context_files": True,
        "skip_memory": True,
    }
    codex_agent = AIAgent(**constructor_kwargs)

    # Post-construction overrides: the tests drive the codex path directly.
    codex_agent.api_mode = "codex_responses"
    codex_agent.provider = "xai-oauth"
    codex_agent._interrupt_requested = False
    return codex_agent
|
|
|
|
|
|
@pytest.mark.parametrize(
    "provider_message",
    [
        "You do not have an active Grok subscription",
        "rate limit exceeded",
        "model not available",
    ],
)
def test_codex_stream_wire_error_event_surfaces_stream_error_event(provider_message):
    """A ``type=error`` frame on the wire must raise ``_StreamErrorEvent``.

    The raised exception has to carry the provider's own message, both in its
    string form and under ``body["error"]["message"]``.
    """
    from run_agent import _StreamErrorEvent

    agent = _make_codex_agent()

    class _ErrorCreateStream:
        """Fake stream whose only event is a wire-level error frame."""

        def __iter__(self):
            yield SimpleNamespace(
                type="error",
                message=provider_message,
                code="forbidden",
            )

        def close(self):
            pass

    stub_client = MagicMock()
    stub_client.responses.create.return_value = _ErrorCreateStream()

    with pytest.raises(_StreamErrorEvent) as excinfo:
        agent._run_codex_stream({}, client=stub_client)

    raised = excinfo.value
    assert provider_message in str(raised)
    assert raised.body["error"]["message"] == provider_message
|
|
|
|
|
|
def test_codex_stream_retries_remote_protocol_error_once():
    """``httpx.RemoteProtocolError`` from the transport is retried exactly once.

    The retry used to sit on the ``responses.stream(...)`` helper and now sits
    on ``responses.create(stream=True)``.  What the user sees is unchanged:
    one retry, and if that attempt fails too the error is re-raised.
    """
    import httpx

    agent = _make_codex_agent()
    attempts = []

    def _always_drop_connection(**kwargs):
        # Record the attempt, then fail the same way every time.
        attempts.append(kwargs)
        raise httpx.RemoteProtocolError(
            "peer closed connection without sending complete message body"
        )

    stub_client = MagicMock()
    stub_client.responses.create.side_effect = _always_drop_connection

    with pytest.raises(httpx.RemoteProtocolError):
        agent._run_codex_stream({}, client=stub_client)

    # max_stream_retries=1 → one retry + final attempt → 2 create calls total.
    assert len(attempts) == 2
|
|
|
|
|
|
def test_codex_stream_unrelated_runtimeerror_still_raises():
    """A ``RuntimeError`` that is not a transport error must propagate as-is.

    The event-driven path has no separate fallback function to divert into,
    so a ``RuntimeError`` raised by ``responses.create()`` (or by the
    consumer) reaches the caller directly.
    """
    agent = _make_codex_agent()

    failing_client = MagicMock()
    failing_client.responses.create.side_effect = RuntimeError("something else broke")

    with pytest.raises(RuntimeError, match="something else broke"):
        agent._run_codex_stream({}, client=failing_client)
|
|
|
|
|
|
def test_codex_stream_truncated_no_terminal_event_raises():
    """A stream with no terminal event and no items must raise.

    This keeps the "Codex Responses stream did not emit a terminal response"
    signal that callers rely on to tell "stream truncated mid-flight" apart
    from "stream completed with empty body".  The SDK used to surface it as
    ``RuntimeError("Didn't receive a `response.completed` event.")``; the
    event consumer now raises it itself.
    """
    agent = _make_codex_agent()

    class _EmptyStream:
        """Fake stream that yields no events at all."""

        def __iter__(self):
            return iter([])

        def close(self):
            pass

    stub_client = MagicMock()
    stub_client.responses.create.return_value = _EmptyStream()

    with pytest.raises(RuntimeError, match="did not emit a terminal response"):
        agent._run_codex_stream({}, client=stub_client)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix B: surface xAI's entitlement body verbatim (no editorializing)
|
|
#
|
|
# The original PR #26644 appended a hint that led with "X Premium+ does NOT
|
|
# include xAI API access — only standalone SuperGrok subscribers can use this
|
|
# provider." xAI announced on 2026-05-16 that X Premium subs now work in
|
|
# Hermes (https://x.ai/news/grok-hermes), making that hint actively wrong:
|
|
# a Premium+ user hitting a real entitlement issue (no Grok sub, wrong tier,
|
|
# exhausted quota) would be misdirected to switch subscriptions when their
|
|
# Premium sub is in fact valid. We now surface xAI's own body text verbatim
|
|
# (which already says "Manage subscriptions at https://grok.com/?_s=usage")
|
|
# and leave the diagnosis to xAI's wording.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_summarize_api_error_surfaces_xai_entitlement_body_verbatim():
    """The xAI OAuth 403 body is passed through without a Hermes-side hint."""
    from run_agent import AIAgent

    entitlement_error = RuntimeError(
        "HTTP 403: Error code: 403 - {'code': 'The caller does not have permission "
        "to execute the specified operation', 'error': 'You have either run out of "
        "available resources or do not have an active Grok subscription. Manage "
        "subscriptions at https://grok.com'}"
    )

    summary = AIAgent._summarize_api_error(entitlement_error)

    # The user needs xAI's own wording to diagnose the problem.
    assert "do not have an active Grok subscription" in summary

    # The outdated "X Premium is incompatible" claim must be gone.
    stale_phrases = (
        "X Premium+ does NOT include",
        "standalone SuperGrok subscribers",
    )
    for phrase in stale_phrases:
        assert phrase not in summary
|
|
|
|
|
|
def test_summarize_api_error_xai_body_message_unwrapped():
    """An SDK-style error carrying a structured ``body`` yields a clean message."""
    from run_agent import AIAgent

    class _XaiErr(Exception):
        """Stand-in for an SDK exception exposing ``status_code`` and ``body``."""

        status_code = 403
        body = {
            "error": {
                "message": (
                    "You have either run out of available resources or do "
                    "not have an active Grok subscription. Manage at "
                    "https://grok.com"
                )
            }
        }

    sdk_error = _XaiErr("403")
    summary = AIAgent._summarize_api_error(sdk_error)

    assert "HTTP 403" in summary
    assert "do not have an active Grok subscription" in summary
    # xAI's wording stands on its own; nothing is layered on top.
    assert "X Premium+ does NOT include" not in summary
|
|
|
|
|
|
def test_summarize_api_error_passes_through_unrelated_errors():
    """Errors unrelated to xAI entitlement are summarized without xAI wording."""
    from run_agent import AIAgent

    summary = AIAgent._summarize_api_error(RuntimeError("HTTP 500: upstream is sad"))

    for xai_marker in ("SuperGrok", "grok.com"):
        assert xai_marker not in summary
    assert "upstream is sad" in summary
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix D: _StreamErrorEvent xAI entitlement classified as auth, not retryable
|
|
#
|
|
# run_codex_create_stream_fallback raises _StreamErrorEvent (status_code=None)
|
|
# when the Responses stream emits a ``type=error`` SSE frame. Before this
|
|
# fix, classify_api_error had no match for "grok subscription" in its pattern
|
|
# lists, so it returned FailoverReason.unknown (retryable=True) — burning
|
|
# max_retries before the agent stopped. _is_entitlement_failure was never
|
|
# called because it only runs when FailoverReason.auth is returned.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_classify_api_error_stream_event_grok_subscription_is_auth():
    """A ``_StreamErrorEvent`` carrying xAI's subscription text is auth, not retryable.

    The SSE error path has ``status_code=None``, so ``_classify_by_status`` is
    skipped.  The explicit pattern added at step 1 has to match first and
    return auth/non-retryable so ``_is_entitlement_failure`` can stop the loop.
    """
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    subscription_message = (
        "You have either run out of available resources or do not have an "
        "active Grok subscription. Manage subscriptions at https://grok.com"
    )
    stream_error = _StreamErrorEvent(
        subscription_message,
        code="The caller does not have permission to execute the specified operation",
    )

    verdict = classify_api_error(stream_error, provider="xai-oauth", model="grok-4.3")

    assert verdict.reason == FailoverReason.auth
    assert verdict.retryable is False
    assert verdict.should_fallback is True
|
|
|
|
|
|
def test_classify_api_error_stream_event_resources_exhausted_grok_is_auth():
    """The 'out of available resources' + 'grok' wording is classified as auth too."""
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    stream_error = _StreamErrorEvent("You have run out of available resources for Grok.")

    verdict = classify_api_error(stream_error, provider="xai-oauth", model="grok-4.3")

    assert verdict.reason == FailoverReason.auth
    assert verdict.retryable is False
|
|
|
|
|
|
def test_classify_api_error_stream_event_unrelated_not_reclassified():
    """The xAI guard must leave an unrelated ``_StreamErrorEvent`` alone."""
    from run_agent import _StreamErrorEvent
    from agent.error_classifier import classify_api_error, FailoverReason

    unrelated_error = _StreamErrorEvent("Internal server error — try again later")

    verdict = classify_api_error(unrelated_error, provider="xai-oauth", model="grok-4.3")

    assert verdict.reason != FailoverReason.auth
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix C: reasoning replay gating for xai-oauth
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _assistant_msg_with_encrypted_reasoning(text="hi from grok", encrypted="enc_blob"):
|
|
return {
|
|
"role": "assistant",
|
|
"content": text,
|
|
"codex_reasoning_items": [
|
|
{
|
|
"type": "reasoning",
|
|
"id": "rs_xai_001",
|
|
"encrypted_content": encrypted,
|
|
"summary": [],
|
|
}
|
|
],
|
|
}
|
|
|
|
|
|
def test_codex_reasoning_replay_default_includes_encrypted_content():
    """With default arguments (native Codex) encrypted reasoning is still replayed."""
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    conversation = [
        {"role": "user", "content": "hi"},
        _assistant_msg_with_encrypted_reasoning(),
        {"role": "user", "content": "what's your name?"},
    ]

    items = _chat_messages_to_responses_input(conversation)

    reasoning = []
    for item in items:
        if item.get("type") == "reasoning":
            reasoning.append(item)

    assert len(reasoning) == 1
    assert reasoning[0]["encrypted_content"] == "enc_blob"
|
|
|
|
|
|
def test_codex_reasoning_replay_includes_encrypted_content_for_xai():
    """xAI gets replayed encrypted reasoning items (May 2026 reversal).

    These items were stripped earlier on the theory that the OAuth/SuperGrok
    surface rejected them.  xAI later confirmed they explicitly want Hermes
    to thread encrypted reasoning back across turns for cross-turn
    coherence — that's the whole point of the partnership integration.
    """
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    conversation = [
        {"role": "user", "content": "hi"},
        _assistant_msg_with_encrypted_reasoning(),
        {"role": "user", "content": "what's your name?"},
    ]

    items = _chat_messages_to_responses_input(conversation, is_xai_responses=True)

    reasoning = [item for item in items if item.get("type") == "reasoning"]
    assert len(reasoning) == 1, (
        "xAI must receive replayed reasoning items — see docstring for the "
        "May 2026 reversal of the earlier suppression gate."
    )
    assert reasoning[0]["encrypted_content"] == "enc_blob"

    # The visible assistant text has to travel alongside the reasoning item.
    assistant_items = []
    for item in items:
        if item.get("role") == "assistant" or item.get("type") == "message":
            assistant_items.append(item)
    assert assistant_items, "assistant message must still be present"
|
|
|
|
|
|
def test_codex_transport_xai_request_includes_encrypted_content():
    """The xAI request's ``include`` array asks for ``reasoning.encrypted_content``.

    Request-side half of the May 2026 reversal: xAI is asked to echo the
    encrypted reasoning back so the following turn can replay it.
    """
    from agent.transports.codex import ResponsesApiTransport

    system_prompt = "you are a helpful assistant"
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "hi"},
    ]

    request_kwargs = ResponsesApiTransport().build_kwargs(
        model="grok-4.3",
        messages=conversation,
        tools=None,
        instructions="you are a helpful assistant",
        reasoning_config={"enabled": True, "effort": "medium"},
        is_xai_responses=True,
    )

    assert request_kwargs["include"] == ["reasoning.encrypted_content"]
|
|
|
|
|
|
def test_codex_transport_xai_replays_reasoning_in_input():
    """End-to-end: ``build_kwargs`` for xAI replays prior encrypted reasoning."""
    from agent.transports.codex import ResponsesApiTransport

    conversation = [
        {"role": "system", "content": "sys"},
        {"role": "user", "content": "hi"},
        _assistant_msg_with_encrypted_reasoning(text="hi from grok"),
        {"role": "user", "content": "what's your name?"},
    ]

    request_kwargs = ResponsesApiTransport().build_kwargs(
        model="grok-4.3",
        messages=conversation,
        tools=None,
        instructions="sys",
        reasoning_config={"enabled": True, "effort": "medium"},
        is_xai_responses=True,
    )

    reasoning_items = [
        item for item in request_kwargs["input"] if item.get("type") == "reasoning"
    ]
    assert len(reasoning_items) == 1
    assert reasoning_items[0]["encrypted_content"] == "enc_blob"
|
|
|
|
|
|
def test_codex_transport_native_codex_still_replays_reasoning_in_input():
    """Regression guard: the openai-codex replay path must stay intact."""
    from agent.transports.codex import ResponsesApiTransport

    conversation = [
        {"role": "system", "content": "sys"},
        {"role": "user", "content": "hi"},
        _assistant_msg_with_encrypted_reasoning(text="hi from codex"),
        {"role": "user", "content": "next"},
    ]

    request_kwargs = ResponsesApiTransport().build_kwargs(
        model="gpt-5-codex",
        messages=conversation,
        tools=None,
        instructions="sys",
        reasoning_config={"enabled": True, "effort": "medium"},
        is_xai_responses=False,
    )

    reasoning_items = [
        item for item in request_kwargs["input"] if item.get("type") == "reasoning"
    ]
    assert len(reasoning_items) == 1
    assert reasoning_items[0]["encrypted_content"] == "enc_blob"

    # Native Codex keeps requesting encrypted_content in the response.
    assert "reasoning.encrypted_content" in request_kwargs.get("include", [])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix D: entitlement 403 must NOT trigger credential-pool refresh loop
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.parametrize(
    "message",
    [
        # The exact wire text RaidenTyler and Don Piedro captured.
        "You have either run out of available resources or do not have an "
        "active Grok subscription. Manage at https://grok.com",
        # Permission-style variant from the same 403 body.
        "The caller does not have permission to execute the specified "
        "operation for grok-4.3",
    ],
)
def test_is_entitlement_failure_matches_real_xai_bodies(message):
    """Captured xAI 403 bodies must be recognised as entitlement failures."""
    from run_agent import AIAgent

    error_context = {"message": message, "reason": "permission_denied"}

    assert AIAgent._is_entitlement_failure(error_context, 403)
|
|
|
|
|
|
def test_is_entitlement_failure_false_for_status_other_than_401_403():
    """A matching body is still not entitlement when the status is 500/429/200."""
    from run_agent import AIAgent

    matching_body = {"message": "do not have an active Grok subscription"}

    for status in (500, 429, 200):
        assert not AIAgent._is_entitlement_failure(matching_body, status)
|
|
|
|
|
|
def test_is_entitlement_failure_false_for_unrelated_auth_errors():
    """Genuine auth failures (expired token, wrong key) are not entitlement."""
    from run_agent import AIAgent

    is_entitlement = AIAgent._is_entitlement_failure

    # Generic Anthropic-style auth failure.
    invalid_key = {"message": "Invalid API key", "reason": "authentication_error"}
    assert not is_entitlement(invalid_key, 401)

    # OAuth token expired.
    expired_token = {"message": "Token has expired", "reason": "unauthorized"}
    assert not is_entitlement(expired_token, 401)

    # Empty or missing context.
    assert not is_entitlement({}, 401)
    assert not is_entitlement(None, 401)
|
|
|
|
|
|
def test_recover_with_credential_pool_skips_refresh_on_entitlement_403():
    """An entitlement 403 must not reach ``pool.try_refresh_current()``.

    Before the fix, an unsubscribed xAI OAuth account kept the agent loop
    spinning: refresh → 403 → refresh → 403, without end.  With the
    entitlement guard, recovery returns False so the error surfaces to the
    caller instead.
    """
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()

    # Fake credential pool that counts refresh attempts.
    refresh_counter = {"n": 0}

    class _FakePool:
        def try_refresh_current(self):
            refresh_counter["n"] += 1
            return MagicMock(id="should_not_be_called")

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()

    entitlement_context = {
        "reason": "The caller does not have permission to execute the specified operation",
        "message": (
            "You have either run out of available resources or do not have an "
            "active Grok subscription. Manage at https://grok.com"
        ),
    }

    recovered, _retried_429 = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=entitlement_context,
    )

    assert recovered is False, "Entitlement 403 must surface, not silently recover"
    assert refresh_counter["n"] == 0, "try_refresh_current must NOT be called on entitlement 403"
|
|
|
|
|
|
def test_recover_with_credential_pool_skips_refresh_on_bare_403_for_xai_oauth():
    """A keyword-free HTTP 403 from ``xai-oauth`` must not start a refresh loop.

    Regression for #26847 — xAI's backend has been seen to 403 standard
    SuperGrok subscribers with a terser body that contains none of the
    existing entitlement keywords ("do not have an active Grok
    subscription", etc.).  Before the defense-in-depth guard, the recovery
    path would mint a fresh token, get a fresh 403, and spin.
    """
    from run_agent import AIAgent
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()
    assert agent.provider == "xai-oauth"

    refresh_counter = {"n": 0}

    class _FakePool:
        def try_refresh_current(self):
            refresh_counter["n"] += 1
            return MagicMock(id="should_not_be_called")

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()

    bare_context = {"reason": "forbidden", "message": "Forbidden"}
    assert not AIAgent._is_entitlement_failure(bare_context, 403), (
        "Pre-condition: bare 'Forbidden' body must NOT match the keyword "
        "heuristic — otherwise this test isn't covering the defense-in-depth path."
    )

    recovered, _retried_429 = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=bare_context,
    )

    assert recovered is False, "Bare 403 on xai-oauth must surface, not refresh-loop"
    assert refresh_counter["n"] == 0, "try_refresh_current must NOT be called on xai-oauth 403"
|
|
|
|
|
|
def test_recover_with_credential_pool_still_refreshes_genuine_auth_failure():
    """Regression guard: a legitimate auth error still goes through refresh."""
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()

    refresh_counter = {"n": 0}

    class _FakePool:
        def try_refresh_current(self):
            refresh_counter["n"] += 1
            # Hand back a fake refreshed entry — semantically "refresh worked".
            refreshed = MagicMock()
            refreshed.id = "entry_refreshed"
            return refreshed

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()
    # The recovery path calls _swap_credential — stub it out.
    agent._swap_credential = MagicMock()

    auth_context = {"reason": "authentication_error", "message": "Invalid API key"}

    recovered, _retried_429 = agent._recover_with_credential_pool(
        status_code=401,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=auth_context,
    )

    assert recovered is True, "Genuine auth failure must still recover via refresh"
    assert refresh_counter["n"] == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix D-bis: bad-credentials 403 must NOT be classified as entitlement (#29344)
|
|
#
|
|
# xAI returns the same permission-denied ``code`` text for two distinct
|
|
# conditions: unsubscribed account vs. stale OAuth access token. The
|
|
# ``error`` field's ``[WKE=unauthenticated:...]`` suffix (and the
|
|
# accompanying "OAuth2 access token could not be validated" phrasing) is
|
|
# xAI's authoritative disambiguator — when present, the body is an auth
|
|
# failure, not entitlement, and the credential-pool refresh path must
|
|
# run. Pre-fix, long-running TUI sessions stuck on a stale token
|
|
# surfaced as a non-retryable client error; the workaround was to exit
|
|
# and reopen the TUI so the startup-resolve path refreshed.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_is_entitlement_failure_false_for_bad_credentials_wke_suffix():
    """A 403 with ``[WKE=unauthenticated:bad-credentials]`` is auth, not entitlement.

    Body shape taken verbatim from the #29344 report: the ``code`` text
    matches the entitlement permission-denied heuristic, but the ``error``
    field carries xAI's explicit "this is a credential validation failure"
    signal, which the classifier must honor.
    """
    from run_agent import AIAgent

    raw_body = {
        "code": "The caller does not have permission to execute the specified operation",
        "error": "The OAuth2 access token could not be validated. [WKE=unauthenticated:bad-credentials]",
    }

    assert not AIAgent._is_entitlement_failure(raw_body, 403)
|
|
|
|
|
|
def test_is_entitlement_failure_false_for_wke_suffix_in_normalized_shape():
    """Same bad-credentials body, in the ``_extract_api_error_context`` shape.

    Real runtime paths feed the classifier through
    ``_extract_api_error_context``, which converts the raw body to
    ``{message, reason, reset_at}``.  The disambiguator has to fire for BOTH
    the raw-body shape and this normalised shape, so the fix reaches the
    production call site at ``_recover_with_credential_pool``.
    """
    from run_agent import AIAgent

    normalized_context = {
        "reason": "The caller does not have permission to execute the specified operation",
        "message": "The OAuth2 access token could not be validated. [WKE=unauthenticated:bad-credentials]",
    }

    assert not AIAgent._is_entitlement_failure(normalized_context, 403)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "wke_variant",
    [
        # The headline variant — what xAI returns today.
        "[WKE=unauthenticated:bad-credentials]",
        # Forward-compat: xAI documents the WKE prefix as a stable shape,
        # the suffix after the colon is the "reason code" and could grow
        # new values.  Anything under ``unauthenticated:`` must route to
        # the refresh path.
        "[WKE=unauthenticated:expired-token]",
        "[WKE=unauthenticated:revoked]",
        "[WKE=unauthenticated:some-future-reason]",
    ],
)
def test_is_entitlement_failure_false_for_any_wke_unauthenticated_variant(wke_variant):
    """Every ``[WKE=unauthenticated:...]`` suffix must be treated as non-entitlement."""
    from run_agent import AIAgent

    raw_body = {
        "code": "The caller does not have permission to execute the specified operation",
        "error": f"Token rejected. {wke_variant}",
    }

    assert not AIAgent._is_entitlement_failure(raw_body, 403)
|
|
|
|
|
|
def test_is_entitlement_failure_false_via_oauth2_validation_phrase_alone():
    """The "OAuth2 access token could not be validated" phrase alone is enough.

    Second disambiguator: even with no WKE suffix, this phrase must route to
    refresh.  It guards against xAI dropping or reformatting the WKE suffix
    in a future API revision while keeping the human-readable error text.
    """
    from run_agent import AIAgent

    raw_body = {
        "code": "The caller does not have permission to execute the specified operation",
        "error": "The OAuth2 access token could not be validated.",
    }

    assert not AIAgent._is_entitlement_failure(raw_body, 403)
|
|
|
|
|
|
def test_is_entitlement_failure_wke_signal_overrides_entitlement_keywords():
    """When a body has BOTH the WKE suffix and entitlement wording, WKE wins.

    Defensive case for a hypothetical future xAI body.  Auth is recoverable;
    entitlement isn't.  If the refreshed token still can't access the
    resource, the next 403 (without WKE) lands on the entitlement path
    correctly.
    """
    from run_agent import AIAgent

    mixed_error_text = (
        "do not have an active Grok subscription. "
        "[WKE=unauthenticated:bad-credentials]"
    )
    raw_body = {
        "code": "The caller does not have permission to execute the specified operation",
        "error": mixed_error_text,
    }

    assert not AIAgent._is_entitlement_failure(raw_body, 403)
|
|
|
|
|
|
def test_is_entitlement_failure_case_insensitive_wke_match():
    """The WKE substring match ignores case.

    The classifier lowercases everything before matching, so a future xAI
    build that changes the casing of the prefix would not reintroduce the
    misclassification.
    """
    from run_agent import AIAgent

    raw_body = {
        "code": "The caller does not have permission to execute the specified operation",
        "error": "[wke=Unauthenticated:Bad-Credentials]",
    }

    assert not AIAgent._is_entitlement_failure(raw_body, 403)
|
|
|
|
|
|
def test_recover_with_credential_pool_refreshes_on_xai_bad_credentials_403():
    """End-to-end #29344: a bad-credentials 403 from xai-oauth triggers refresh.

    ``try_refresh_current()`` MUST be called so a long-running TUI session
    recovers without an exit/reopen cycle.

    The scaffolding mirrors
    ``test_recover_with_credential_pool_still_refreshes_genuine_auth_failure``
    but uses the exact 403 body shape xAI ships for stale tokens — the very
    body that pre-fix tripped the entitlement classifier and short-circuited
    the refresh path.
    """
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()

    refresh_counter = {"n": 0}

    class _FakePool:
        def try_refresh_current(self):
            refresh_counter["n"] += 1
            refreshed = MagicMock()
            refreshed.id = "entry_refreshed_after_stale"
            return refreshed

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def has_available(self):
            return False

    agent._credential_pool = _FakePool()
    agent._swap_credential = MagicMock()

    # Normalised shape that ``_extract_api_error_context`` would
    # produce for the reporter's wire-level body.
    stale_token_context = {
        "reason": "The caller does not have permission to execute the specified operation",
        "message": (
            "The OAuth2 access token could not be validated. "
            "[WKE=unauthenticated:bad-credentials]"
        ),
    }

    recovered, _retried_429 = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=stale_token_context,
    )

    assert recovered is True, (
        "Stale OAuth token (bad-credentials 403) must trigger refresh — "
        "pre-fix this returned False because the entitlement classifier "
        "over-matched on the permission-denied code text"
    )
    assert refresh_counter["n"] == 1, "try_refresh_current must run exactly once"
    agent._swap_credential.assert_called_once()
|
|
|
|
|
|
def test_recover_with_credential_pool_still_blocks_real_entitlement():
    """Companion guard to the #29344 fix: entitlement 403s must not refresh.

    The original #26847 protection has to survive the new disambiguator.
    The error context here is a plain unsubscribed-account body: it has
    neither a WKE suffix nor the OAuth2-validation phrase. Recovery must
    report failure and the stub pool's refresh must never be invoked.
    """
    from agent.error_classifier import FailoverReason

    agent = _make_codex_agent()

    # Single-slot list so the nested stub can mutate the counter.
    refresh_count = [0]

    class _NeverRefreshPool:
        def has_available(self):
            return False

        def mark_exhausted_and_rotate(self, **_kwargs):
            return None

        def try_refresh_current(self):
            refresh_count[0] += 1
            entry = MagicMock()
            entry.id = "should_not_be_called"
            return entry

    agent._credential_pool = _NeverRefreshPool()

    # Pure entitlement body — no WKE suffix, no OAuth2 phrase.
    entitlement_context = {
        "reason": "The caller does not have permission to execute the specified operation",
        "message": (
            "You have either run out of available resources or do not have an "
            "active Grok subscription. Manage at https://grok.com"
        ),
    }

    recovered, _ = agent._recover_with_credential_pool(
        status_code=403,
        has_retried_429=False,
        classified_reason=FailoverReason.auth,
        error_context=entitlement_context,
    )

    assert recovered is False, "Entitlement 403 must surface, not refresh"
    assert refresh_count[0] == 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Fix E: grok-4.3 context length must be 1M, not 256K
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_grok_4_3_context_length_is_1m():
    """grok-4.3 must map to a 1M-token context window.

    Per docs.x.ai/developers/models/grok-4.3 the model ships with 1M context.
    The substring-match fallback used to land on the "grok-4" catch-all
    (256k), under-reporting the real capacity. This test checks both the
    table entry and that the longest matching key for grok-4.3 slugs is
    "grok-4.3".
    """
    from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS

    one_million = 1_000_000

    # The dedicated entry exists with the expected value.
    assert DEFAULT_CONTEXT_LENGTHS["grok-4.3"] == one_million

    # Picking the longest key contained in the slug must resolve both
    # grok-4.3 and grok-4.3-latest to the new entry, NOT the grok-4 catch-all.
    for slug in ("grok-4.3", "grok-4.3-latest"):
        lowered = slug.lower()
        candidates = [key for key in DEFAULT_CONTEXT_LENGTHS if key in lowered]
        matched_key = max(candidates, key=len)
        assert matched_key == "grok-4.3", (
            f"Expected longest-first match to land on grok-4.3 for {slug}, "
            f"got {matched_key}"
        )
        assert DEFAULT_CONTEXT_LENGTHS[matched_key] == one_million
|
|
|
|
|
|
def test_grok_4_still_resolves_to_256k():
    """Regression guard: grok-4 (non-.3) slugs must still resolve to 256k.

    For each slug, the longest ``DEFAULT_CONTEXT_LENGTHS`` key that is a
    substring of the lower-cased slug is selected and its context length is
    checked. If no key matches at all, the test fails with an explicit
    message instead of the opaque ``ValueError`` that ``max()`` raises on an
    empty iterable.
    """
    from agent.model_metadata import DEFAULT_CONTEXT_LENGTHS

    for slug in ("grok-4", "grok-4-0709"):
        # Loop-invariant for the key scan below; compute it once per slug.
        lowered = slug.lower()
        matched_key = max(
            (k for k in DEFAULT_CONTEXT_LENGTHS if k in lowered),
            key=len,
            default=None,
        )
        assert matched_key is not None, (
            f"No DEFAULT_CONTEXT_LENGTHS key is a substring of {slug!r}"
        )
        # grok-4-0709 contains "grok-4" but not "grok-4.3"; matched key
        # must be "grok-4" (or a more specific variant family if one is
        # ever added). The 256k contract must hold.
        assert DEFAULT_CONTEXT_LENGTHS[matched_key] == 256_000
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Cross-issuer reasoning replay guard
#
# When a session switches model providers mid-conversation (e.g. user runs
# /model gpt-5.5 after several turns on grok-4.3), the persisted reasoning
# items carry encrypted_content that only the issuing endpoint can decrypt.
# Replaying them against the new endpoint deterministically returns HTTP 400
# invalid_encrypted_content and breaks every subsequent turn. The cross-issuer
# guard stamps each reasoning item with its issuer on normalize and drops
# foreign-issuer items on replay.
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _stamped_assistant_msg(issuer_kind, *, text="hi", encrypted="enc_blob", rs_id="rs_001"):
|
|
return {
|
|
"role": "assistant",
|
|
"content": text,
|
|
"codex_reasoning_items": [
|
|
{
|
|
"type": "reasoning",
|
|
"id": rs_id,
|
|
"encrypted_content": encrypted,
|
|
"summary": [],
|
|
"_issuer_kind": issuer_kind,
|
|
}
|
|
],
|
|
}
|
|
|
|
|
|
def test_cross_issuer_reasoning_is_dropped_on_replay():
    """Reasoning minted by one Responses endpoint must not reach another.

    Regression for the chatgpt-backend vs xAI-OAuth swap, where every turn
    after a mid-session model change came back with
    invalid_encrypted_content. The history holds a reasoning item stamped
    ``xai_responses``; converting it for ``codex_backend`` must yield no
    reasoning items.
    """
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    history = [
        {"role": "user", "content": "hi"},
        _stamped_assistant_msg("xai_responses", encrypted="grok_blob"),
        {"role": "user", "content": "next"},
    ]

    # Target issuer is codex_backend, so the grok-issued blob must be dropped.
    replay_input = _chat_messages_to_responses_input(
        history, current_issuer_kind="codex_backend"
    )
    reasoning = list(filter(lambda it: it.get("type") == "reasoning", replay_input))

    assert reasoning == [], (
        "Reasoning items stamped with a foreign _issuer_kind must be dropped "
        "before the API rejects the whole request with invalid_encrypted_content."
    )
|
|
|
|
|
|
def test_same_issuer_reasoning_is_still_replayed():
    """Same-endpoint reasoning replay must keep working.

    This is the documented happy path (May 2026 reversal), and the
    cross-issuer guard must not regress it: a reasoning item stamped
    ``xai_responses`` replayed against ``xai_responses`` is kept, with its
    encrypted content intact and the internal stamp removed.
    """
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    history = [
        {"role": "user", "content": "hi"},
        _stamped_assistant_msg("xai_responses", encrypted="grok_blob"),
        {"role": "user", "content": "next"},
    ]

    replay_input = _chat_messages_to_responses_input(
        history, current_issuer_kind="xai_responses"
    )
    reasoning = list(filter(lambda it: it.get("type") == "reasoning", replay_input))

    assert len(reasoning) == 1
    kept = reasoning[0]
    assert kept["encrypted_content"] == "grok_blob"
    # The internal stamp must not leak to the API payload.
    assert "_issuer_kind" not in kept
|
|
|
|
|
|
def test_unstamped_reasoning_is_replayed_for_backwards_compat():
    """Legacy reasoning items without ``_issuer_kind`` must still be replayed.

    Items persisted before the cross-issuer patch carry no stamp. Converting
    such a history for ``codex_backend`` must keep the single reasoning item
    and its encrypted content.
    """
    from agent.codex_responses_adapter import _chat_messages_to_responses_input

    legacy_reasoning = {
        "type": "reasoning",
        "id": "rs_legacy",
        "encrypted_content": "legacy_blob",
        "summary": [],
    }
    legacy_assistant = {
        "role": "assistant",
        "content": "hello",
        "codex_reasoning_items": [legacy_reasoning],
    }
    history = [
        {"role": "user", "content": "hi"},
        legacy_assistant,
        {"role": "user", "content": "next"},
    ]

    replay_input = _chat_messages_to_responses_input(
        history, current_issuer_kind="codex_backend"
    )
    reasoning = list(filter(lambda it: it.get("type") == "reasoning", replay_input))

    assert len(reasoning) == 1
    assert reasoning[0]["encrypted_content"] == "legacy_blob"
|
|
|
|
|
|
def test_normalize_codex_response_stamps_issuer_on_reasoning():
    """Normalising a response must stamp captured reasoning with its issuer.

    A fake response with one reasoning item and one assistant message is
    normalised with ``issuer_kind="xai_responses"``. The resulting message
    must carry exactly one reasoning item, stamped with that issuer and with
    its encrypted content preserved, so a later replay against a different
    endpoint can drop it.
    """
    from types import SimpleNamespace

    from agent.codex_responses_adapter import _normalize_codex_response

    output_text = SimpleNamespace(type="output_text", text="ok")
    fake_output = [
        SimpleNamespace(
            type="reasoning",
            id="rs_new",
            encrypted_content="fresh_blob",
            summary=[],
        ),
        SimpleNamespace(
            type="message",
            role="assistant",
            status="completed",
            content=[output_text],
            id="msg_1",
        ),
    ]
    response = SimpleNamespace(output=fake_output, status="completed")

    msg, _ = _normalize_codex_response(response, issuer_kind="xai_responses")

    assert msg.codex_reasoning_items and len(msg.codex_reasoning_items) == 1
    stamped = msg.codex_reasoning_items[0]
    assert stamped["_issuer_kind"] == "xai_responses"
    assert stamped["encrypted_content"] == "fresh_blob"
|
|
|
|
|
|
def test_transport_round_trip_drops_foreign_reasoning():
    """Full transport flow must strip foreign-issuer reasoning from ``input``.

    After a grok turn (reasoning stamped ``xai_responses``), calling
    ``build_kwargs`` for the codex backend must produce an ``input`` array
    with zero reasoning items.
    """
    from agent.transports.codex import ResponsesApiTransport

    history = [
        {"role": "system", "content": "you are hermes"},
        {"role": "user", "content": "hi"},
        _stamped_assistant_msg("xai_responses", encrypted="grok_blob"),
        {"role": "user", "content": "엑스다임 프로젝트 파악, 스킬로 정리."},
    ]
    call_args = {
        "model": "gpt-5.5",
        "messages": history,
        "tools": None,
        "is_codex_backend": True,
        "is_xai_responses": False,
        "is_github_responses": False,
        "base_url": "https://chatgpt.com/backend-api/codex",
        "instructions": "you are hermes",
    }

    kwargs = ResponsesApiTransport().build_kwargs(**call_args)

    reasoning = list(filter(lambda it: it.get("type") == "reasoning", kwargs["input"]))
    assert reasoning == [], (
        "Cross-issuer reasoning leaked through build_kwargs — this is the "
        "exact regression that broke session 40de1ae0 on 2026-05-25 01:09."
    )
|