fix(auth): stop replaying invalid Nous refresh tokens

Quarantine Nous OAuth state when refresh fails with terminal invalid_grant/invalid_token errors. Clear local and shared refresh material across runtime, managed access-token, proxy, and credential-pool paths so Hermes stops retrying revoked refresh sessions.
This commit is contained in:
Robin Fernandes 2026-05-17 15:41:03 +10:00 committed by Teknium
parent 4c46c35ed0
commit c905562623
6 changed files with 338 additions and 14 deletions

View file

@ -929,6 +929,47 @@ class CredentialPool:
self._persist()
self._sync_device_code_entry_to_auth_store(updated)
return updated
if auth_mod._is_terminal_nous_refresh_error(exc):
logger.debug("Nous refresh token is terminally invalid; clearing local token state")
try:
with _auth_store_lock():
auth_store = _load_auth_store()
state = _load_provider_state(auth_store, "nous") or {
"client_id": entry.client_id,
"portal_base_url": entry.portal_base_url,
"inference_base_url": entry.inference_base_url,
"token_type": entry.token_type,
"scope": entry.scope,
"tls": entry.tls,
}
store_refresh = str(state.get("refresh_token") or "").strip()
entry_refresh = str(entry.refresh_token or "").strip()
if not store_refresh or store_refresh == entry_refresh:
auth_mod._quarantine_nous_oauth_state(
state,
exc,
reason="credential_pool_refresh_failure",
)
_save_provider_state(auth_store, "nous", state)
_save_auth_store(auth_store)
except Exception as clear_exc:
logger.debug("Failed to clear terminal Nous OAuth state: %s", clear_exc)
cleared = replace(
entry,
access_token=None,
refresh_token=None,
agent_key=None,
agent_key_expires_at=None,
)
self._replace_entry(entry, cleared)
self._persist()
self._mark_exhausted(
cleared,
401,
{"reason": getattr(exc, "code", None), "message": str(exc)},
)
return None
self._mark_exhausted(entry, None)
return None

View file

@ -3616,6 +3616,63 @@ def _read_shared_nous_state() -> Optional[Dict[str, Any]]:
return payload
def _clear_shared_nous_state(reason: str) -> None:
    """Delete the shared Nous OAuth store file, best effort.

    Args:
        reason: Short label recorded on the ``nous_shared_store_cleared``
            trace event.

    A store file that is already missing is not an error. Any other
    failure is logged at debug level and swallowed, so this function
    never raises to its caller.
    """
    try:
        with _nous_shared_store_lock():
            store_path = _nous_shared_store_path()
            try:
                store_path.unlink()
            except FileNotFoundError:
                # Nothing to remove; the store is already gone.
                pass
        _oauth_trace("nous_shared_store_cleared", reason=reason)
    except Exception as err:
        logger.debug("Failed to clear shared Nous auth store: %s", err)
def _is_terminal_nous_refresh_error(exc: Exception) -> bool:
    """Tell whether retrying the same Nous refresh token is pointless.

    Returns True only when ``exc`` is an ``AuthError`` for the ``"nous"``
    provider, its code is ``invalid_grant`` or ``invalid_token``, and it
    is flagged as requiring a fresh login. Anything else returns False.
    """
    if not isinstance(exc, AuthError):
        return False
    if exc.provider != "nous":
        return False
    if exc.code not in {"invalid_grant", "invalid_token"}:
        return False
    return bool(exc.relogin_required)
def _quarantine_nous_oauth_state(
    state: Dict[str, Any],
    error: AuthError,
    *,
    reason: str,
) -> None:
    """Strip dead OAuth material from ``state`` and record why.

    Args:
        state: Nous provider state dict; mutated in place. Keys that are
            not token or agent-key related are left untouched.
        error: The auth failure that triggered the quarantine; its code
            and message are copied into ``state["last_auth_error"]``.
        reason: Label identifying the calling path. It is stored in the
            error record and forwarded to the shared-store cleanup.

    This function does not write ``state`` to disk itself. It does clear
    the shared Nous store and invalidate the cached Nous auth status.
    """
    dead_fields = (
        "access_token",
        "refresh_token",
        "expires_at",
        "expires_in",
        "obtained_at",
        "agent_key",
        "agent_key_id",
        "agent_key_expires_at",
        "agent_key_expires_in",
        "agent_key_reused",
        "agent_key_obtained_at",
    )
    for field in dead_fields:
        state.pop(field, None)

    # Leave a breadcrumb so later readers of the state can see why the
    # tokens are gone and that a re-login is needed.
    failure_record = {
        "provider": "nous",
        "code": error.code,
        "message": str(error),
        "reason": reason,
        "relogin_required": True,
        "at": datetime.now(timezone.utc).isoformat(),
    }
    state["last_auth_error"] = failure_record

    _clear_shared_nous_state(reason)
    invalidate_nous_auth_status_cache()
def _try_import_shared_nous_state(
*,
timeout_seconds: float = 15.0,
@ -3671,6 +3728,8 @@ def _try_import_shared_nous_state(
error_type=type(exc).__name__,
error_code=getattr(exc, "code", None),
)
if _is_terminal_nous_refresh_error(exc):
_clear_shared_nous_state("shared_import_terminal_refresh_failure")
logger.debug("Shared Nous import failed: %s", exc)
return None
except Exception as exc:
@ -3896,12 +3955,23 @@ def resolve_nous_access_token(
headers={"Accept": "application/json"},
verify=verify,
) as client:
refreshed = _refresh_access_token(
client=client,
portal_base_url=portal_base_url,
client_id=client_id,
refresh_token=refresh_token,
)
try:
refreshed = _refresh_access_token(
client=client,
portal_base_url=portal_base_url,
client_id=client_id,
refresh_token=refresh_token,
)
except AuthError as exc:
if _is_terminal_nous_refresh_error(exc):
_quarantine_nous_oauth_state(
state,
exc,
reason="managed_access_token_refresh_failure",
)
_save_provider_state(auth_store, "nous", state)
_save_auth_store(auth_store)
raise
now = datetime.now(timezone.utc)
access_ttl = _coerce_ttl_seconds(refreshed.get("expires_in"))
@ -4209,10 +4279,20 @@ def resolve_nous_runtime_credentials(
reason="access_expiring",
refresh_token_fp=_token_fingerprint(refresh_token),
)
refreshed = _refresh_access_token(
client=client, portal_base_url=portal_base_url,
client_id=client_id, refresh_token=refresh_token,
)
try:
refreshed = _refresh_access_token(
client=client, portal_base_url=portal_base_url,
client_id=client_id, refresh_token=refresh_token,
)
except AuthError as exc:
if _is_terminal_nous_refresh_error(exc):
_quarantine_nous_oauth_state(
state,
exc,
reason="runtime_access_refresh_failure",
)
_persist_state("terminal_runtime_access_refresh_failure")
raise
now = datetime.now(timezone.utc)
access_ttl = _coerce_ttl_seconds(refreshed.get("expires_in"))
previous_refresh_token = refresh_token
@ -4283,10 +4363,20 @@ def resolve_nous_runtime_credentials(
reason="mint_retry_after_invalid_token",
refresh_token_fp=_token_fingerprint(latest_refresh_token),
)
refreshed = _refresh_access_token(
client=client, portal_base_url=portal_base_url,
client_id=client_id, refresh_token=latest_refresh_token,
)
try:
refreshed = _refresh_access_token(
client=client, portal_base_url=portal_base_url,
client_id=client_id, refresh_token=latest_refresh_token,
)
except AuthError as exc:
if _is_terminal_nous_refresh_error(exc):
_quarantine_nous_oauth_state(
state,
exc,
reason="runtime_mint_retry_refresh_failure",
)
_persist_state("terminal_runtime_mint_retry_refresh_failure")
raise
now = datetime.now(timezone.utc)
access_ttl = _coerce_ttl_seconds(refreshed.get("expires_in"))
state["access_token"] = refreshed["access_token"]

View file

@ -16,8 +16,11 @@ import threading
from typing import Any, Dict, FrozenSet, Optional
from hermes_cli.auth import (
AuthError,
DEFAULT_NOUS_INFERENCE_URL,
_load_auth_store,
_is_terminal_nous_refresh_error,
_quarantine_nous_oauth_state,
_save_auth_store,
_write_shared_nous_state,
refresh_nous_oauth_from_state,
@ -81,6 +84,17 @@ class NousPortalAdapter(UpstreamAdapter):
try:
refreshed = refresh_nous_oauth_from_state(state)
except AuthError as exc:
if _is_terminal_nous_refresh_error(exc):
_quarantine_nous_oauth_state(
state,
exc,
reason="proxy_refresh_failure",
)
self._save_state(state)
raise RuntimeError(
f"Failed to refresh Nous Portal credentials: {exc}"
) from exc
except Exception as exc:
raise RuntimeError(
f"Failed to refresh Nous Portal credentials: {exc}"

View file

@ -510,6 +510,70 @@ def test_load_pool_migrates_nous_provider_state(tmp_path, monkeypatch):
assert entry.agent_key == "agent-key"
def test_nous_pool_terminal_refresh_clears_tokens(tmp_path, monkeypatch):
    """A terminal refresh error exhausts the pool entry and wipes its tokens.

    Also checks that a second refresh attempt does not call the refresh
    helper again.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
    monkeypatch.setenv("HERMES_SHARED_AUTH_DIR", str(tmp_path / "shared"))
    seeded_nous_state = {
        "portal_base_url": "https://portal.example.com",
        "inference_base_url": "https://inference.example.com/v1",
        "client_id": "hermes-cli",
        "token_type": "Bearer",
        "scope": "inference:mint_agent_key",
        "access_token": "access-token",
        "refresh_token": "refresh-token",
        "expires_at": "2026-03-24T12:00:00+00:00",
        "agent_key": "agent-key",
        "agent_key_expires_at": "2026-03-24T13:30:00+00:00",
    }
    _write_auth_store(
        tmp_path,
        {
            "version": 1,
            "active_provider": "nous",
            "providers": {"nous": seeded_nous_state},
        },
    )

    from agent.credential_pool import load_pool
    from hermes_cli import auth as auth_mod
    from hermes_cli.auth import AuthError

    refresh_attempts = []

    def _terminal_refresh_failure(*_args, **_kwargs):
        refresh_attempts.append(1)
        raise AuthError(
            "Refresh session has been revoked",
            provider="nous",
            code="invalid_grant",
            relogin_required=True,
        )

    monkeypatch.setattr(
        auth_mod, "refresh_nous_oauth_from_state", _terminal_refresh_failure
    )

    pool = load_pool("nous")
    assert pool.select() is not None
    assert pool.try_refresh_current() is None

    # The in-memory pool entry is exhausted and carries no secrets.
    exhausted = pool.entries()[0]
    assert exhausted.last_status == "exhausted"
    assert exhausted.last_error_code == 401
    assert exhausted.refresh_token is None
    assert exhausted.access_token is None
    assert exhausted.agent_key is None

    # The persisted auth store is quarantined as well.
    auth_file = tmp_path / "hermes" / "auth.json"
    persisted_nous = json.loads(auth_file.read_text())["providers"]["nous"]
    assert not persisted_nous.get("refresh_token")
    assert not persisted_nous.get("access_token")
    assert not persisted_nous.get("agent_key")
    assert persisted_nous["last_auth_error"]["code"] == "invalid_grant"

    # A second attempt must not replay the dead refresh token.
    assert pool.try_refresh_current() is None
    assert len(refresh_attempts) == 1
def test_load_pool_removes_stale_file_backed_singleton_entry(tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)

View file

@ -373,6 +373,89 @@ def test_refresh_token_persisted_when_mint_times_out(tmp_path, monkeypatch):
assert state_after_failure["access_token"] == "access-1"
def test_terminal_refresh_failure_quarantines_tokens(
    tmp_path, monkeypatch, shared_store_env,
):
    """A revoked refresh token is quarantined by the runtime path, not replayed."""
    from hermes_cli import auth as auth_mod

    hermes_home = tmp_path / "hermes"
    _setup_nous_auth(hermes_home, refresh_token="refresh-old")
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))

    # Seed the shared store with the same stale refresh token.
    shared_payload = _full_state_fixture()
    shared_payload.update(
        {
            "access_token": "access-old",
            "refresh_token": "refresh-old",
            "expires_at": "2026-02-01T00:00:00+00:00",
        }
    )
    auth_mod._write_shared_nous_state(shared_payload)

    seen_refresh_tokens: list[str] = []

    def _terminal_refresh_failure(*, client, portal_base_url, client_id, refresh_token):
        seen_refresh_tokens.append(refresh_token)
        raise AuthError(
            "Refresh session has been revoked",
            provider="nous",
            code="invalid_grant",
            relogin_required=True,
        )

    monkeypatch.setattr(auth_mod, "_refresh_access_token", _terminal_refresh_failure)

    with pytest.raises(AuthError, match="Refresh session has been revoked"):
        auth_mod.resolve_nous_runtime_credentials(min_key_ttl_seconds=300)

    quarantined = auth_mod.get_provider_auth_state("nous")
    assert quarantined is not None
    for secret_key in ("refresh_token", "access_token", "agent_key"):
        assert not quarantined.get(secret_key)
    assert quarantined["last_auth_error"]["code"] == "invalid_grant"
    assert auth_mod._read_shared_nous_state() is None

    # The second call fails before any refresh is attempted.
    with pytest.raises(AuthError, match="No access token found"):
        auth_mod.resolve_nous_runtime_credentials(min_key_ttl_seconds=300)
    assert seen_refresh_tokens == ["refresh-old"]
def test_managed_access_token_refresh_failure_quarantines_tokens(
    tmp_path, monkeypatch, shared_store_env,
):
    """The managed access-token path quarantines state on a terminal refresh error."""
    from hermes_cli import auth as auth_mod

    hermes_home = tmp_path / "hermes"
    _setup_nous_auth(hermes_home, refresh_token="refresh-old")
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))

    seen_refresh_tokens: list[str] = []

    def _terminal_refresh_failure(*, client, portal_base_url, client_id, refresh_token):
        seen_refresh_tokens.append(refresh_token)
        raise AuthError(
            "Invalid refresh token",
            provider="nous",
            code="invalid_grant",
            relogin_required=True,
        )

    monkeypatch.setattr(auth_mod, "_refresh_access_token", _terminal_refresh_failure)

    with pytest.raises(AuthError, match="Invalid refresh token"):
        auth_mod.resolve_nous_access_token()

    quarantined = auth_mod.get_provider_auth_state("nous")
    assert quarantined is not None
    for secret_key in ("refresh_token", "access_token"):
        assert not quarantined.get(secret_key)
    assert quarantined["last_auth_error"]["message"] == "Invalid refresh token"

    # The second call fails before any refresh is attempted.
    with pytest.raises(AuthError, match="No access token found"):
        auth_mod.resolve_nous_access_token()
    assert seen_refresh_tokens == ["refresh-old"]
def test_mint_retry_uses_latest_rotated_refresh_token(tmp_path, monkeypatch):
hermes_home = tmp_path / "hermes"
_setup_nous_auth(hermes_home, refresh_token="refresh-old")
@ -1118,6 +1201,7 @@ def test_try_import_shared_returns_none_on_refresh_failure(
monkeypatch.setattr(auth_mod, "refresh_nous_oauth_from_state", _boom)
assert auth_mod._try_import_shared_nous_state() is None
assert auth_mod._read_shared_nous_state() is None
def test_try_import_shared_rehydrates_on_success(shared_store_env, monkeypatch):

View file

@ -164,6 +164,37 @@ def test_nous_adapter_get_credential_raises_on_refresh_failure(tmp_path, monkeyp
adapter.get_credential()
def test_nous_adapter_quarantines_terminal_refresh_failure(tmp_path, monkeypatch):
    """The proxy adapter persists quarantined state on a terminal refresh error."""
    from hermes_cli.auth import AuthError

    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    _write_auth_store(tmp_path, {
        "access_token": "access-tok",
        "refresh_token": "refresh-tok",
        "agent_key": "stale-agent-key",
    })

    revoked = AuthError(
        "Refresh session has been revoked",
        provider="nous",
        code="invalid_grant",
        relogin_required=True,
    )
    refresh_target = "hermes_cli.proxy.adapters.nous_portal.refresh_nous_oauth_from_state"
    with patch(refresh_target, side_effect=revoked):
        adapter = NousPortalAdapter()
        # The adapter surfaces the auth failure as a RuntimeError.
        with pytest.raises(RuntimeError, match="Refresh session has been revoked"):
            adapter.get_credential()

    persisted_nous = json.loads((tmp_path / "auth.json").read_text())["providers"]["nous"]
    for secret_key in ("refresh_token", "access_token", "agent_key"):
        assert not persisted_nous.get(secret_key)
    assert persisted_nous["last_auth_error"]["code"] == "invalid_grant"
def test_nous_adapter_get_credential_raises_when_no_agent_key_returned(tmp_path, monkeypatch):
"""If the refresh helper succeeds but produces no agent_key, we surface a clear error."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))