fix(codex-oauth): quarantine terminal refresh errors so dead tokens are not replayed across sessions

When a Codex OAuth refresh token is permanently invalidated (HTTP 400/401/403,
token revoked or reused), _mark_exhausted was called but auth.json was left with
the dead credentials. On the next session, _seed_from_singletons re-read
auth.json and re-seeded the pool with the same revoked token, triggering the
same terminal failure in a loop.

Add _is_terminal_codex_oauth_refresh_error to auth.py and a matching quarantine
block in _refresh_entry: when a terminal error is detected and auth.json holds
no newer tokens, clear access_token/refresh_token from auth.json and remove all
device_code-sourced pool entries from memory. Mirrors the Nous quarantine added
in c90556262 and the xAI quarantine in #28116.

Also add a pre-refresh sync from auth.json before calling refresh_codex_oauth_pure,
matching the xAI and Nous patterns, to avoid refresh_token_reused races when
multiple Hermes processes share the same auth.json singleton.

Salvaged from #27911 by @EloquentBrush0x — contributor's branch was severely
stale (would have reverted ~5000 LOC across azure/kanban/i18n subsystems);
fix re-applied surgically on current main with their predicate and tests preserved.
This commit is contained in:
EloquentBrush0x 2026-05-18 10:31:13 -07:00 committed by Teknium
parent 9aae59feab
commit b570e0fdd0
3 changed files with 237 additions and 0 deletions

View file

@ -1963,3 +1963,144 @@ def test_xai_oauth_nonterminal_refresh_does_not_quarantine(tmp_path, monkeypatch
tokens = auth_payload["providers"]["xai-oauth"].get("tokens", {})
assert tokens.get("access_token") == "old-access-token"
assert tokens.get("refresh_token") == "old-refresh-token"
# ---------------------------------------------------------------------------
# Codex OAuth terminal error quarantine
# ---------------------------------------------------------------------------
def _codex_auth_store(access_token: str, refresh_token: str) -> dict:
return {
"version": 1,
"active_provider": "openai-codex",
"providers": {
"openai-codex": {
"tokens": {
"access_token": access_token,
"refresh_token": refresh_token,
},
}
},
}
def test_is_terminal_codex_oauth_refresh_error():
from hermes_cli.auth import AuthError, _is_terminal_codex_oauth_refresh_error
assert _is_terminal_codex_oauth_refresh_error(
AuthError("Refresh failed", provider="openai-codex", code="codex_refresh_failed", relogin_required=True)
)
assert _is_terminal_codex_oauth_refresh_error(
AuthError("No token", provider="openai-codex", code="codex_auth_missing_refresh_token", relogin_required=True)
)
assert _is_terminal_codex_oauth_refresh_error(
AuthError("Revoked", provider="openai-codex", code="invalid_grant", relogin_required=True)
)
assert _is_terminal_codex_oauth_refresh_error(
AuthError("Reused", provider="openai-codex", code="refresh_token_reused", relogin_required=True)
)
# transient 429/5xx: relogin_required=False -> not terminal
assert not _is_terminal_codex_oauth_refresh_error(
AuthError("Rate limit", provider="openai-codex", code="codex_refresh_failed", relogin_required=False)
)
# xAI error does not trigger Codex check
assert not _is_terminal_codex_oauth_refresh_error(
AuthError("Revoked", provider="xai-oauth", code="xai_refresh_failed", relogin_required=True)
)
# Generic exception
assert not _is_terminal_codex_oauth_refresh_error(ValueError("oops"))
def test_codex_oauth_terminal_refresh_clears_auth_json_and_removes_pool_entries(
tmp_path, monkeypatch
):
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
monkeypatch.delenv("CODEX_OAUTH_ACCESS_TOKEN", raising=False)
_write_auth_store(tmp_path, _codex_auth_store("old-access-token", "old-refresh-token"))
from agent.credential_pool import PooledCredential, load_pool
import hermes_cli.auth as auth_mod
from hermes_cli.auth import AuthError
pool = load_pool("openai-codex")
selected = pool.select()
assert selected is not None
assert selected.source == "device_code"
# Add a manual API-key entry that must survive the quarantine.
pool.add_entry(PooledCredential.from_dict("openai-codex", {
"id": "manual-key",
"source": "manual",
"auth_type": "api_key",
"access_token": "manual-codex-key",
}))
refresh_calls = {"count": 0}
def _terminal_refresh_failure(*_args, **_kwargs):
refresh_calls["count"] += 1
raise AuthError(
"Refresh session has been revoked",
provider="openai-codex",
code="codex_refresh_failed",
relogin_required=True,
)
monkeypatch.setattr(auth_mod, "refresh_codex_oauth_pure", _terminal_refresh_failure)
assert pool.try_refresh_current() is None
# Only the manual entry survives.
assert [entry.id for entry in pool.entries()] == ["manual-key"]
# Auth.json tokens must be cleared.
auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
codex_state = auth_payload["providers"]["openai-codex"]
tokens = codex_state.get("tokens", {})
assert not tokens.get("access_token")
assert not tokens.get("refresh_token")
assert codex_state["last_auth_error"]["code"] == "codex_refresh_failed"
assert codex_state["last_auth_error"]["relogin_required"] is True
# Persisted pool must also have only the manual entry.
assert [entry["id"] for entry in auth_payload["credential_pool"]["openai-codex"]] == ["manual-key"]
# A second try_refresh_current must not call refresh_codex_oauth_pure again.
assert pool.try_refresh_current() is None
assert refresh_calls["count"] == 1
def test_codex_oauth_nonterminal_refresh_does_not_quarantine(tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
monkeypatch.delenv("CODEX_OAUTH_ACCESS_TOKEN", raising=False)
_write_auth_store(tmp_path, _codex_auth_store("old-access-token", "old-refresh-token"))
from agent.credential_pool import load_pool
import hermes_cli.auth as auth_mod
from hermes_cli.auth import AuthError
pool = load_pool("openai-codex")
assert pool.select() is not None
def _transient_failure(*_args, **_kwargs):
raise AuthError(
"Rate limited",
provider="openai-codex",
code="codex_refresh_failed",
relogin_required=False,
)
monkeypatch.setattr(auth_mod, "refresh_codex_oauth_pure", _transient_failure)
pool.try_refresh_current()
# Tokens must NOT be cleared from auth.json.
auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
tokens = auth_payload["providers"]["openai-codex"].get("tokens", {})
assert tokens.get("access_token") == "old-access-token"
assert tokens.get("refresh_token") == "old-refresh-token"