fix(codex-oauth): quarantine terminal refresh errors so dead tokens are not replayed across sessions

When a Codex OAuth refresh token is permanently invalidated (HTTP 400/401/403, token revoked or reused), _mark_exhausted was called but auth.json was left with the dead credentials. On the next session, _seed_from_singletons re-read auth.json and re-seeded the pool with the same revoked token, triggering the same terminal failure in a loop. Add _is_terminal_codex_oauth_refresh_error to auth.py and a matching quarantine block in _refresh_entry: when a terminal error is detected and auth.json holds no newer tokens, clear access_token/refresh_token from auth.json and remove all device_code-sourced pool entries from memory. Mirrors the Nous quarantine added in c90556262 and the xAI quarantine in #28116. Also add a pre-refresh sync from auth.json before calling refresh_codex_oauth_pure, matching the xAI and Nous patterns, to avoid refresh_token_reused races when multiple Hermes processes share the same auth.json singleton. Salvaged from #27911 by @EloquentBrush0x — contributor's branch was severely stale (would have reverted ~5000 LOC across azure/kanban/i18n subsystems); fix re-applied surgically on current main with their predicate and tests preserved.
2026-06-06 07:51:53 +00:00 · 2026-05-18 10:31:13 -07:00 · 2026-05-18 10:31:13 -07:00 · b570e0fdd0
commit b570e0fdd0
parent 9aae59feab
3 changed files with 237 additions and 0 deletions
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@ -797,6 +797,13 @@ class CredentialPool:
                    except Exception as wexc:
                        logger.debug("Failed to write refreshed token to credentials file: %s", wexc)
            elif self.provider == "openai-codex":
+                # Adopt fresher tokens from auth.json before spending the
+                # refresh_token — single-use tokens consumed by another Hermes
+                # process sharing the same auth.json singleton would otherwise
+                # trigger ``refresh_token_reused`` on the next POST.
+                synced = self._sync_codex_entry_from_auth_store(entry)
+                if synced is not entry:
+                    entry = synced
                refreshed = auth_mod.refresh_codex_oauth_pure(
                    entry.access_token,
                    entry.refresh_token,
@ -951,6 +958,72 @@ class CredentialPool:
                        self._current_id = None
                    self._persist()
                    return None
+            # For openai-codex: same race as xAI/nous — another Hermes process
+            # may have consumed the refresh token between our proactive sync
+            # and the HTTP call.  Re-check auth.json and adopt the fresh tokens
+            # if they have rotated since.
+            if self.provider == "openai-codex":
+                synced = self._sync_codex_entry_from_auth_store(entry)
+                if synced.refresh_token != entry.refresh_token:
+                    logger.debug(
+                        "Codex OAuth refresh failed but auth.json has newer tokens — adopting"
+                    )
+                    updated = replace(
+                        synced,
+                        last_status=STATUS_OK,
+                        last_status_at=None,
+                        last_error_code=None,
+                        last_error_reason=None,
+                        last_error_message=None,
+                        last_error_reset_at=None,
+                    )
+                    self._replace_entry(synced, updated)
+                    self._persist()
+                    return updated
+                # Terminal error: auth.json has no newer tokens — the stored
+                # refresh_token is dead.  Clear it from auth.json so the next
+                # session does not re-seed the same revoked credentials, and
+                # remove all singleton-seeded (device_code) entries from the
+                # in-memory pool.  Mirrors the xAI and Nous quarantine paths.
+                if auth_mod._is_terminal_codex_oauth_refresh_error(exc):
+                    logger.debug(
+                        "Codex OAuth refresh token is terminally invalid; clearing local token state"
+                    )
+                    try:
+                        with _auth_store_lock():
+                            auth_store = _load_auth_store()
+                            state = _load_provider_state(auth_store, "openai-codex") or {}
+                            if isinstance(state, dict):
+                                tokens = state.get("tokens") or {}
+                                if isinstance(tokens, dict):
+                                    store_refresh = str(tokens.get("refresh_token") or "").strip()
+                                    entry_refresh = str(entry.refresh_token or "").strip()
+                                    if not store_refresh or store_refresh == entry_refresh:
+                                        tokens.pop("access_token", None)
+                                        tokens.pop("refresh_token", None)
+                                        state["tokens"] = tokens
+                                        state["last_auth_error"] = {
+                                            "provider": "openai-codex",
+                                            "code": getattr(exc, "code", "unknown"),
+                                            "message": str(exc),
+                                            "reason": "credential_pool_refresh_failure",
+                                            "relogin_required": True,
+                                            "at": datetime.now(timezone.utc).isoformat(),
+                                        }
+                                        _save_provider_state(auth_store, "openai-codex", state)
+                                        _save_auth_store(auth_store)
+                    except Exception as clear_exc:
+                        logger.debug(
+                            "Failed to clear terminal Codex OAuth state: %s", clear_exc
+                        )
+                    self._entries = [
+                        item for item in self._entries
+                        if item.source != "device_code"
+                    ]
+                    if self._current_id == entry.id:
+                        self._current_id = None
+                    self._persist()
+                    return None
            # For nous: another process may have consumed the refresh token
            # between our proactive sync and the HTTP call.  Re-sync from
            # auth.json and adopt the fresh tokens if available.