From b570e0fdd0d64742fd96c928821ecc0cf5d91ef7 Mon Sep 17 00:00:00 2001 From: EloquentBrush0x <283442588+EloquentBrush0x@users.noreply.github.com> Date: Mon, 18 May 2026 10:31:13 -0700 Subject: [PATCH] fix(codex-oauth): quarantine terminal refresh errors so dead tokens are not replayed across sessions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a Codex OAuth refresh token is permanently invalidated (HTTP 400/401/403, token revoked or reused), _mark_exhausted was called but auth.json was left with the dead credentials. On the next session, _seed_from_singletons re-read auth.json and re-seeded the pool with the same revoked token, triggering the same terminal failure in a loop. Add _is_terminal_codex_oauth_refresh_error to auth.py and a matching quarantine block in _refresh_entry: when a terminal error is detected and auth.json holds no newer tokens, clear access_token/refresh_token from auth.json and remove all device_code-sourced pool entries from memory. Mirrors the Nous quarantine added in c90556262 and the xAI quarantine in #28116. Also add a pre-refresh sync from auth.json before calling refresh_codex_oauth_pure, matching the xAI and Nous patterns, to avoid refresh_token_reused races when multiple Hermes processes share the same auth.json singleton. Salvaged from #27911 by @EloquentBrush0x — contributor's branch was severely stale (would have reverted ~5000 LOC across azure/kanban/i18n subsystems); fix re-applied surgically on current main with their predicate and tests preserved. --- agent/credential_pool.py | 73 ++++++++++++++ hermes_cli/auth.py | 23 +++++ tests/agent/test_credential_pool.py | 141 ++++++++++++++++++++++++++++ 3 files changed, 237 insertions(+) diff --git a/agent/credential_pool.py b/agent/credential_pool.py index 416f6016653..9a5cc20fe6f 100644 --- a/agent/credential_pool.py +++ b/agent/credential_pool.py @@ -797,6 +797,13 @@ class CredentialPool: except Exception as wexc: logger.debug("Failed to write refreshed token to credentials file: %s", wexc) elif self.provider == "openai-codex": + # Adopt fresher tokens from auth.json before spending the + # refresh_token — single-use tokens consumed by another Hermes + # process sharing the same auth.json singleton would otherwise + # trigger ``refresh_token_reused`` on the next POST. + synced = self._sync_codex_entry_from_auth_store(entry) + if synced is not entry: + entry = synced refreshed = auth_mod.refresh_codex_oauth_pure( entry.access_token, entry.refresh_token, @@ -951,6 +958,72 @@ class CredentialPool: self._current_id = None self._persist() return None + # For openai-codex: same race as xAI/nous — another Hermes process + # may have consumed the refresh token between our proactive sync + # and the HTTP call. Re-check auth.json and adopt the fresh tokens + # if they have rotated since. + if self.provider == "openai-codex": + synced = self._sync_codex_entry_from_auth_store(entry) + if synced.refresh_token != entry.refresh_token: + logger.debug( + "Codex OAuth refresh failed but auth.json has newer tokens — adopting" + ) + updated = replace( + synced, + last_status=STATUS_OK, + last_status_at=None, + last_error_code=None, + last_error_reason=None, + last_error_message=None, + last_error_reset_at=None, + ) + self._replace_entry(synced, updated) + self._persist() + return updated + # Terminal error: auth.json has no newer tokens — the stored + # refresh_token is dead. Clear it from auth.json so the next + # session does not re-seed the same revoked credentials, and + # remove all singleton-seeded (device_code) entries from the + # in-memory pool. Mirrors the xAI and Nous quarantine paths. + if auth_mod._is_terminal_codex_oauth_refresh_error(exc): + logger.debug( + "Codex OAuth refresh token is terminally invalid; clearing local token state" + ) + try: + with _auth_store_lock(): + auth_store = _load_auth_store() + state = _load_provider_state(auth_store, "openai-codex") or {} + if isinstance(state, dict): + tokens = state.get("tokens") or {} + if isinstance(tokens, dict): + store_refresh = str(tokens.get("refresh_token") or "").strip() + entry_refresh = str(entry.refresh_token or "").strip() + if not store_refresh or store_refresh == entry_refresh: + tokens.pop("access_token", None) + tokens.pop("refresh_token", None) + state["tokens"] = tokens + state["last_auth_error"] = { + "provider": "openai-codex", + "code": getattr(exc, "code", "unknown"), + "message": str(exc), + "reason": "credential_pool_refresh_failure", + "relogin_required": True, + "at": datetime.now(timezone.utc).isoformat(), + } + _save_provider_state(auth_store, "openai-codex", state) + _save_auth_store(auth_store) + except Exception as clear_exc: + logger.debug( + "Failed to clear terminal Codex OAuth state: %s", clear_exc + ) + self._entries = [ + item for item in self._entries + if item.source != "device_code" + ] + if self._current_id == entry.id: + self._current_id = None + self._persist() + return None # For nous: another process may have consumed the refresh token # between our proactive sync and the HTTP call. Re-sync from # auth.json and adopt the fresh tokens if available. diff --git a/hermes_cli/auth.py b/hermes_cli/auth.py index f223e101b15..d06e9a739ea 100644 --- a/hermes_cli/auth.py +++ b/hermes_cli/auth.py @@ -4061,6 +4061,29 @@ def _is_terminal_xai_oauth_refresh_error(exc: Exception) -> bool: ) +def _is_terminal_codex_oauth_refresh_error(exc: Exception) -> bool: + """True when retrying the same Codex OAuth refresh token cannot succeed. + + ``codex_refresh_failed`` covers HTTP 400/401/403 from the token endpoint + (invalid_grant, token revoked, refresh_token_reused). + ``codex_auth_missing_refresh_token`` means the pool entry has no refresh + token at all — retrying will never work. + Both carry ``relogin_required=True``; transient failures (429, 5xx) do not. + """ + return ( + isinstance(exc, AuthError) + and exc.provider == "openai-codex" + and exc.code in { + "codex_refresh_failed", + "codex_auth_missing_refresh_token", + "invalid_grant", + "invalid_token", + "refresh_token_reused", + } + and bool(exc.relogin_required) + ) + + def _quarantine_nous_oauth_state( state: Dict[str, Any], error: AuthError, diff --git a/tests/agent/test_credential_pool.py b/tests/agent/test_credential_pool.py index 034dc7377ca..bcb1ed595dd 100644 --- a/tests/agent/test_credential_pool.py +++ b/tests/agent/test_credential_pool.py @@ -1963,3 +1963,144 @@ def test_xai_oauth_nonterminal_refresh_does_not_quarantine(tmp_path, monkeypatch tokens = auth_payload["providers"]["xai-oauth"].get("tokens", {}) assert tokens.get("access_token") == "old-access-token" assert tokens.get("refresh_token") == "old-refresh-token" + + +# --------------------------------------------------------------------------- +# Codex OAuth terminal error quarantine +# --------------------------------------------------------------------------- + + +def _codex_auth_store(access_token: str, refresh_token: str) -> dict: + return { + "version": 1, + "active_provider": "openai-codex", + "providers": { + "openai-codex": { + "tokens": { + "access_token": access_token, + "refresh_token": refresh_token, + }, + } + }, + } + + +def test_is_terminal_codex_oauth_refresh_error(): + from hermes_cli.auth import AuthError, _is_terminal_codex_oauth_refresh_error + + assert _is_terminal_codex_oauth_refresh_error( + AuthError("Refresh failed", provider="openai-codex", code="codex_refresh_failed", relogin_required=True) + ) + assert _is_terminal_codex_oauth_refresh_error( + AuthError("No token", provider="openai-codex", code="codex_auth_missing_refresh_token", relogin_required=True) + ) + assert _is_terminal_codex_oauth_refresh_error( + AuthError("Revoked", provider="openai-codex", code="invalid_grant", relogin_required=True) + ) + assert _is_terminal_codex_oauth_refresh_error( + AuthError("Reused", provider="openai-codex", code="refresh_token_reused", relogin_required=True) + ) + # transient 429/5xx: relogin_required=False -> not terminal + assert not _is_terminal_codex_oauth_refresh_error( + AuthError("Rate limit", provider="openai-codex", code="codex_refresh_failed", relogin_required=False) + ) + # xAI error does not trigger Codex check + assert not _is_terminal_codex_oauth_refresh_error( + AuthError("Revoked", provider="xai-oauth", code="xai_refresh_failed", relogin_required=True) + ) + # Generic exception + assert not _is_terminal_codex_oauth_refresh_error(ValueError("oops")) + + +def test_codex_oauth_terminal_refresh_clears_auth_json_and_removes_pool_entries( + tmp_path, monkeypatch +): + monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes")) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("CODEX_OAUTH_ACCESS_TOKEN", raising=False) + + _write_auth_store(tmp_path, _codex_auth_store("old-access-token", "old-refresh-token")) + + from agent.credential_pool import PooledCredential, load_pool + import hermes_cli.auth as auth_mod + from hermes_cli.auth import AuthError + + pool = load_pool("openai-codex") + selected = pool.select() + assert selected is not None + assert selected.source == "device_code" + + # Add a manual API-key entry that must survive the quarantine. + pool.add_entry(PooledCredential.from_dict("openai-codex", { + "id": "manual-key", + "source": "manual", + "auth_type": "api_key", + "access_token": "manual-codex-key", + })) + + refresh_calls = {"count": 0} + + def _terminal_refresh_failure(*_args, **_kwargs): + refresh_calls["count"] += 1 + raise AuthError( + "Refresh session has been revoked", + provider="openai-codex", + code="codex_refresh_failed", + relogin_required=True, + ) + + monkeypatch.setattr(auth_mod, "refresh_codex_oauth_pure", _terminal_refresh_failure) + + assert pool.try_refresh_current() is None + + # Only the manual entry survives. + assert [entry.id for entry in pool.entries()] == ["manual-key"] + + # Auth.json tokens must be cleared. + auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text()) + codex_state = auth_payload["providers"]["openai-codex"] + tokens = codex_state.get("tokens", {}) + assert not tokens.get("access_token") + assert not tokens.get("refresh_token") + assert codex_state["last_auth_error"]["code"] == "codex_refresh_failed" + assert codex_state["last_auth_error"]["relogin_required"] is True + + # Persisted pool must also have only the manual entry. + assert [entry["id"] for entry in auth_payload["credential_pool"]["openai-codex"]] == ["manual-key"] + + # A second try_refresh_current must not call refresh_codex_oauth_pure again. + assert pool.try_refresh_current() is None + assert refresh_calls["count"] == 1 + + +def test_codex_oauth_nonterminal_refresh_does_not_quarantine(tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes")) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("CODEX_OAUTH_ACCESS_TOKEN", raising=False) + + _write_auth_store(tmp_path, _codex_auth_store("old-access-token", "old-refresh-token")) + + from agent.credential_pool import load_pool + import hermes_cli.auth as auth_mod + from hermes_cli.auth import AuthError + + pool = load_pool("openai-codex") + assert pool.select() is not None + + def _transient_failure(*_args, **_kwargs): + raise AuthError( + "Rate limited", + provider="openai-codex", + code="codex_refresh_failed", + relogin_required=False, + ) + + monkeypatch.setattr(auth_mod, "refresh_codex_oauth_pure", _transient_failure) + + pool.try_refresh_current() + + # Tokens must NOT be cleared from auth.json. + auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text()) + tokens = auth_payload["providers"]["openai-codex"].get("tokens", {}) + assert tokens.get("access_token") == "old-access-token" + assert tokens.get("refresh_token") == "old-refresh-token"