From 8b3cb930c9d06053b8fa9f07fd36c25e1796381d Mon Sep 17 00:00:00 2001 From: xxxigm Date: Wed, 20 May 2026 21:46:42 +0700 Subject: [PATCH] fix(xai-oauth): honor [WKE=unauthenticated:...] disambiguator in entitlement classifier (#29344) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``_is_entitlement_failure`` over-matched on xAI 403s. xAI returns the same permission-denied ``code`` text for two distinct conditions: 1. Unsubscribed account ("active Grok subscription. Manage at https://grok.com" in the ``error`` field). 2. Stale OAuth access token ("OAuth2 access token could not be validated. [WKE=unauthenticated:bad-credentials]" in the ``error`` field). The classifier's "does not have permission + grok" substring heuristic treated both identically, so the credential-pool refresh path was short-circuited for case (2) — long-running TUI sessions stuck on a stale OAuth token surfaced a non-retryable client error and the user had to exit + reopen the TUI to recover (the startup-resolve path bypasses the classifier entirely, which is why bridge adapters with proactive refresh cadences didn't see this in practice). This patch adopts the reporter's recommended fix (option 1, tightest): honor xAI's explicit ``[WKE=unauthenticated:...]`` suffix and the ``OAuth2 access token could not be validated`` phrasing as authoritative "this is auth, not entitlement" signals. When either appears anywhere in the body's text fields, the classifier returns False eagerly — *before* the entitlement keyword checks run — so the refresh-on-401 path takes over and the existing loop-protection still guards against runaway refresh storms if the refresh itself fails. Two small adjustments fall out of this: * The haystack now also covers ``code`` and ``error`` keys directly, not just the ``message``/``reason`` shape ``_extract_api_error_context`` produces. Real runtime paths use the normalised shape, but the test suite and any future call sites that pass raw bodies get the same treatment. Backwards compatible: missing keys default to empty strings, the haystack still skips when everything is blank. * Both disambiguator checks fire BEFORE the entitlement keyword checks. If a future xAI body somehow lands with both an entitlement message AND the WKE suffix, the WKE suffix wins (correct — auth is recoverable; entitlement is not, and a refreshed token will surface the entitlement message on the next request anyway). Existing tests (``test_is_entitlement_failure_matches_real_xai_bodies``, ``test_is_entitlement_failure_false_for_unrelated_auth_errors``, ``test_recover_with_credential_pool_skips_refresh_on_entitlement_403``, ``test_recover_with_credential_pool_still_refreshes_genuine_auth_failure``) continue to pass unchanged — the unsubscribed-account path, the generic auth-error path, and the refresh-on-401 path are all left intact. --- run_agent.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/run_agent.py b/run_agent.py index da121869f8d..b364127c278 100644 --- a/run_agent.py +++ b/run_agent.py @@ -1368,6 +1368,18 @@ class AIAgent: * xAI OAuth: "do not have an active Grok subscription" / "out of available resources" / "does not have permission" + "grok" + Disambiguator for xAI (#29344): the same ``code`` text ("The caller + does not have permission to execute the specified operation") is + returned for BOTH an unsubscribed account AND a stale OAuth access + token. xAI ships an explicit signal in the ``error`` field that + tells the two apart: a ``[WKE=unauthenticated:...]`` suffix (and/or + the ``OAuth2 access token could not be validated`` phrasing) means + the credentials failed validation — that's recoverable by refreshing + the token, NOT by surfacing an entitlement message. When either + signal is present we return False eagerly so the credential-pool + refresh path runs, letting long-running TUI sessions recover from + stale tokens without an exit/reopen cycle. + Extend here for new providers as we discover them (Anthropic's Claude Max OAuth entitlement errors look distinct enough today that the existing 1M-context-beta branch handles them; revisit if other @@ -1377,11 +1389,29 @@ class AIAgent: return False if not isinstance(error_context, dict): return False + # Build a single lowercase haystack covering every field shape the + # body might land in. ``_extract_api_error_context`` normalises to + # ``message``/``reason``, but callers (and the test suite) may also + # hand us the raw body with ``code``/``error`` keys; cover both so + # the WKE disambiguator below fires regardless of entry point. message = str(error_context.get("message") or "").lower() reason = str(error_context.get("reason") or "").lower() - haystack = f"{message} {reason}" + code = str(error_context.get("code") or "").lower() + err = str(error_context.get("error") or "").lower() + haystack = f"{message} {reason} {code} {err}" if not haystack.strip(): return False + # xAI's authoritative disambiguator for "stale token" vs + # "unsubscribed account". Both conditions share the same + # permission-denied ``code`` text; only one carries this suffix. + # Bail out before the entitlement keyword checks so a stale OAuth + # token routes through the credential-refresh path instead of the + # surface-error-as-entitlement path. See #29344 for the long- + # running TUI failure mode this closes. + if "[wke=unauthenticated:" in haystack: + return False + if "oauth2 access token could not be validated" in haystack: + return False if "do not have an active grok subscription" in haystack: return True if "out of available resources" in haystack and "grok" in haystack: