fix(codex-oauth): quarantine terminal refresh errors so dead tokens are not replayed across sessions

When a Codex OAuth refresh token is permanently invalidated (HTTP 400/401/403, token revoked or reused), _mark_exhausted was called but auth.json was left with the dead credentials. On the next session, _seed_from_singletons re-read auth.json and re-seeded the pool with the same revoked token, triggering the same terminal failure in a loop. Add _is_terminal_codex_oauth_refresh_error to auth.py and a matching quarantine block in _refresh_entry: when a terminal error is detected and auth.json holds no newer tokens, clear access_token/refresh_token from auth.json and remove all device_code-sourced pool entries from memory. Mirrors the Nous quarantine added in c90556262 and the xAI quarantine in #28116. Also add a pre-refresh sync from auth.json before calling refresh_codex_oauth_pure, matching the xAI and Nous patterns, to avoid refresh_token_reused races when multiple Hermes processes share the same auth.json singleton. Salvaged from #27911 by @EloquentBrush0x — contributor's branch was severely stale (would have reverted ~5000 LOC across azure/kanban/i18n subsystems); fix re-applied surgically on current main with their predicate and tests preserved.
2026-06-06 07:51:53 +00:00 · 2026-05-18 10:31:13 -07:00 · 2026-05-18 10:31:13 -07:00 · b570e0fdd0
commit b570e0fdd0
parent 9aae59feab
3 changed files with 237 additions and 0 deletions
--- a/tests/agent/test_credential_pool.py
+++ b/tests/agent/test_credential_pool.py
@ -1963,3 +1963,144 @@ def test_xai_oauth_nonterminal_refresh_does_not_quarantine(tmp_path, monkeypatch
    tokens = auth_payload["providers"]["xai-oauth"].get("tokens", {})
    assert tokens.get("access_token") == "old-access-token"
    assert tokens.get("refresh_token") == "old-refresh-token"
+
+
+# ---------------------------------------------------------------------------
+# Codex OAuth terminal error quarantine
+# ---------------------------------------------------------------------------
+
+
+def _codex_auth_store(access_token: str, refresh_token: str) -> dict:
+    """Build a minimal auth.json payload holding one openai-codex token pair.
+
+    The payload marks ``openai-codex`` as the active provider and nests the
+    given tokens under ``providers["openai-codex"]["tokens"]``.
+    """
+    token_pair = {"access_token": access_token, "refresh_token": refresh_token}
+    codex_state = {"tokens": token_pair}
+    return {
+        "version": 1,
+        "active_provider": "openai-codex",
+        "providers": {"openai-codex": codex_state},
+    }
+
+
+def test_is_terminal_codex_oauth_refresh_error():
+    """Check which errors the Codex terminal-refresh predicate accepts.
+
+    Codex AuthErrors flagged ``relogin_required=True`` must be reported as
+    terminal; a Codex error without that flag, an error from another
+    provider, and a non-AuthError exception must not.
+    """
+    from hermes_cli.auth import AuthError, _is_terminal_codex_oauth_refresh_error
+
+    def _codex_error(message, code, relogin_required):
+        return AuthError(
+            message,
+            provider="openai-codex",
+            code=code,
+            relogin_required=relogin_required,
+        )
+
+    terminal_cases = (
+        ("Refresh failed", "codex_refresh_failed"),
+        ("No token", "codex_auth_missing_refresh_token"),
+        ("Revoked", "invalid_grant"),
+        ("Reused", "refresh_token_reused"),
+    )
+    for message, code in terminal_cases:
+        assert _is_terminal_codex_oauth_refresh_error(_codex_error(message, code, True))
+
+    # transient 429/5xx: relogin_required=False -> not terminal
+    rate_limited = _codex_error("Rate limit", "codex_refresh_failed", False)
+    assert not _is_terminal_codex_oauth_refresh_error(rate_limited)
+
+    # xAI error does not trigger Codex check
+    xai_error = AuthError(
+        "Revoked", provider="xai-oauth", code="xai_refresh_failed", relogin_required=True
+    )
+    assert not _is_terminal_codex_oauth_refresh_error(xai_error)
+
+    # Generic exception
+    generic_error = ValueError("oops")
+    assert not _is_terminal_codex_oauth_refresh_error(generic_error)
+
+
+def test_codex_oauth_terminal_refresh_clears_auth_json_and_removes_pool_entries(
+    tmp_path, monkeypatch
+):
+    """A terminal Codex refresh failure must quarantine the dead credentials.
+
+    After a refresh that raises a ``relogin_required=True`` AuthError, the
+    test expects: the device_code pool entry to be gone while a manual
+    API-key entry survives, the access/refresh tokens in auth.json to be
+    cleared with the error recorded under ``last_auth_error``, and a second
+    refresh attempt not to call the refresh function again.
+    """
+    # Point the auth store at a throwaway home and drop env credentials so
+    # only the auth.json written below is in play.
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
+    monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+    monkeypatch.delenv("CODEX_OAUTH_ACCESS_TOKEN", raising=False)
+
+    _write_auth_store(tmp_path, _codex_auth_store("old-access-token", "old-refresh-token"))
+
+    # Imported only after the environment and auth store are prepared.
+    from agent.credential_pool import PooledCredential, load_pool
+    import hermes_cli.auth as auth_mod
+    from hermes_cli.auth import AuthError
+
+    # The pool is expected to expose the auth.json tokens as a device_code entry.
+    pool = load_pool("openai-codex")
+    selected = pool.select()
+    assert selected is not None
+    assert selected.source == "device_code"
+
+    # Add a manual API-key entry that must survive the quarantine.
+    pool.add_entry(PooledCredential.from_dict("openai-codex", {
+        "id": "manual-key",
+        "source": "manual",
+        "auth_type": "api_key",
+        "access_token": "manual-codex-key",
+    }))
+
+    # Mutable counter so the stub below can record how often it was invoked.
+    refresh_calls = {"count": 0}
+
+    def _terminal_refresh_failure(*_args, **_kwargs):
+        # Stand-in for the real refresh call: always fails with a
+        # relogin-required (terminal) Codex error.
+        refresh_calls["count"] += 1
+        raise AuthError(
+            "Refresh session has been revoked",
+            provider="openai-codex",
+            code="codex_refresh_failed",
+            relogin_required=True,
+        )
+
+    monkeypatch.setattr(auth_mod, "refresh_codex_oauth_pure", _terminal_refresh_failure)
+
+    assert pool.try_refresh_current() is None
+
+    # Only the manual entry survives.
+    assert [entry.id for entry in pool.entries()] == ["manual-key"]
+
+    # Auth.json tokens must be cleared.
+    auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
+    codex_state = auth_payload["providers"]["openai-codex"]
+    tokens = codex_state.get("tokens", {})
+    assert not tokens.get("access_token")
+    assert not tokens.get("refresh_token")
+    assert codex_state["last_auth_error"]["code"] == "codex_refresh_failed"
+    assert codex_state["last_auth_error"]["relogin_required"] is True
+
+    # Persisted pool must also have only the manual entry.
+    assert [entry["id"] for entry in auth_payload["credential_pool"]["openai-codex"]] == ["manual-key"]
+
+    # A second try_refresh_current must not call refresh_codex_oauth_pure again.
+    assert pool.try_refresh_current() is None
+    assert refresh_calls["count"] == 1
+
+
+def test_codex_oauth_nonterminal_refresh_does_not_quarantine(tmp_path, monkeypatch):
+    """A transient Codex refresh failure must leave auth.json tokens in place.
+
+    The refresh function is replaced with a stub raising a
+    ``relogin_required=False`` AuthError. The test checks that the stub was
+    actually reached and that the access/refresh tokens stored in auth.json
+    are unchanged afterwards.
+    """
+    # Point the auth store at a throwaway home and drop env credentials so
+    # only the auth.json written below is in play.
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
+    monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+    monkeypatch.delenv("CODEX_OAUTH_ACCESS_TOKEN", raising=False)
+
+    _write_auth_store(tmp_path, _codex_auth_store("old-access-token", "old-refresh-token"))
+
+    # Imported only after the environment and auth store are prepared.
+    from agent.credential_pool import load_pool
+    import hermes_cli.auth as auth_mod
+    from hermes_cli.auth import AuthError
+
+    pool = load_pool("openai-codex")
+    assert pool.select() is not None
+
+    # Mutable counter so the stub below can record how often it was invoked.
+    refresh_calls = {"count": 0}
+
+    def _transient_failure(*_args, **_kwargs):
+        refresh_calls["count"] += 1
+        raise AuthError(
+            "Rate limited",
+            provider="openai-codex",
+            code="codex_refresh_failed",
+            relogin_required=False,
+        )
+
+    monkeypatch.setattr(auth_mod, "refresh_codex_oauth_pure", _transient_failure)
+
+    pool.try_refresh_current()
+
+    # Guard against a vacuous pass: if the refresh path never reached the
+    # stub, the "tokens untouched" assertions below would prove nothing.
+    assert refresh_calls["count"] >= 1
+
+    # Tokens must NOT be cleared from auth.json.
+    auth_payload = json.loads((tmp_path / "hermes" / "auth.json").read_text())
+    tokens = auth_payload["providers"]["openai-codex"].get("tokens", {})
+    assert tokens.get("access_token") == "old-access-token"
+    assert tokens.get("refresh_token") == "old-refresh-token"