mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-06 07:51:53 +00:00
fix(codex-oauth): quarantine terminal refresh errors so dead tokens are not replayed across sessions
When a Codex OAuth refresh token was permanently invalidated (HTTP 400/401/403,
token revoked or reused), _mark_exhausted was called but auth.json was left with
the dead credentials. On the next session, _seed_from_singletons re-read
auth.json and re-seeded the pool with the same revoked token, triggering the
same terminal failure in a loop.
Add _is_terminal_codex_oauth_refresh_error to auth.py and a matching quarantine
block in _refresh_entry: when a terminal error is detected and auth.json holds
no newer tokens, clear access_token/refresh_token from auth.json and remove all
device_code-sourced pool entries from memory. Mirrors the Nous quarantine added
in c90556262 and the xAI quarantine in #28116.
Also add a pre-refresh sync from auth.json before calling refresh_codex_oauth_pure,
matching the xAI and Nous patterns, to avoid refresh_token_reused races when
multiple Hermes processes share the same auth.json singleton.
Salvaged from #27911 by @EloquentBrush0x — contributor's branch was severely
stale (would have reverted ~5000 LOC across azure/kanban/i18n subsystems);
fix re-applied surgically on current main with their predicate and tests preserved.
This commit is contained in:
parent
9aae59feab
commit
b570e0fdd0
3 changed files with 237 additions and 0 deletions
|
|
@ -797,6 +797,13 @@ class CredentialPool:
|
|||
except Exception as wexc:
|
||||
logger.debug("Failed to write refreshed token to credentials file: %s", wexc)
|
||||
elif self.provider == "openai-codex":
|
||||
# Adopt fresher tokens from auth.json before spending the
|
||||
# refresh_token — single-use tokens consumed by another Hermes
|
||||
# process sharing the same auth.json singleton would otherwise
|
||||
# trigger ``refresh_token_reused`` on the next POST.
|
||||
synced = self._sync_codex_entry_from_auth_store(entry)
|
||||
if synced is not entry:
|
||||
entry = synced
|
||||
refreshed = auth_mod.refresh_codex_oauth_pure(
|
||||
entry.access_token,
|
||||
entry.refresh_token,
|
||||
|
|
@ -951,6 +958,72 @@ class CredentialPool:
|
|||
self._current_id = None
|
||||
self._persist()
|
||||
return None
|
||||
# For openai-codex: same race as xAI/nous — another Hermes process
|
||||
# may have consumed the refresh token between our proactive sync
|
||||
# and the HTTP call. Re-check auth.json and adopt the fresh tokens
|
||||
# if they have rotated since.
|
||||
if self.provider == "openai-codex":
|
||||
synced = self._sync_codex_entry_from_auth_store(entry)
|
||||
if synced.refresh_token != entry.refresh_token:
|
||||
logger.debug(
|
||||
"Codex OAuth refresh failed but auth.json has newer tokens — adopting"
|
||||
)
|
||||
updated = replace(
|
||||
synced,
|
||||
last_status=STATUS_OK,
|
||||
last_status_at=None,
|
||||
last_error_code=None,
|
||||
last_error_reason=None,
|
||||
last_error_message=None,
|
||||
last_error_reset_at=None,
|
||||
)
|
||||
self._replace_entry(synced, updated)
|
||||
self._persist()
|
||||
return updated
|
||||
# Terminal error: auth.json has no newer tokens — the stored
|
||||
# refresh_token is dead. Clear it from auth.json so the next
|
||||
# session does not re-seed the same revoked credentials, and
|
||||
# remove all singleton-seeded (device_code) entries from the
|
||||
# in-memory pool. Mirrors the xAI and Nous quarantine paths.
|
||||
if auth_mod._is_terminal_codex_oauth_refresh_error(exc):
|
||||
logger.debug(
|
||||
"Codex OAuth refresh token is terminally invalid; clearing local token state"
|
||||
)
|
||||
try:
|
||||
with _auth_store_lock():
|
||||
auth_store = _load_auth_store()
|
||||
state = _load_provider_state(auth_store, "openai-codex") or {}
|
||||
if isinstance(state, dict):
|
||||
tokens = state.get("tokens") or {}
|
||||
if isinstance(tokens, dict):
|
||||
store_refresh = str(tokens.get("refresh_token") or "").strip()
|
||||
entry_refresh = str(entry.refresh_token or "").strip()
|
||||
if not store_refresh or store_refresh == entry_refresh:
|
||||
tokens.pop("access_token", None)
|
||||
tokens.pop("refresh_token", None)
|
||||
state["tokens"] = tokens
|
||||
state["last_auth_error"] = {
|
||||
"provider": "openai-codex",
|
||||
"code": getattr(exc, "code", "unknown"),
|
||||
"message": str(exc),
|
||||
"reason": "credential_pool_refresh_failure",
|
||||
"relogin_required": True,
|
||||
"at": datetime.now(timezone.utc).isoformat(),
|
||||
}
|
||||
_save_provider_state(auth_store, "openai-codex", state)
|
||||
_save_auth_store(auth_store)
|
||||
except Exception as clear_exc:
|
||||
logger.debug(
|
||||
"Failed to clear terminal Codex OAuth state: %s", clear_exc
|
||||
)
|
||||
self._entries = [
|
||||
item for item in self._entries
|
||||
if item.source != "device_code"
|
||||
]
|
||||
if self._current_id == entry.id:
|
||||
self._current_id = None
|
||||
self._persist()
|
||||
return None
|
||||
# For nous: another process may have consumed the refresh token
|
||||
# between our proactive sync and the HTTP call. Re-sync from
|
||||
# auth.json and adopt the fresh tokens if available.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue