fix: 24h cooldown for 401/403 auth failures + user notification

Previously, credentials exhausted due to 401 (invalid token) or 403
(forbidden) used the same 1-hour cooldown as 429 rate limits. This meant
the system would retry an invalid token every hour forever — burning API
calls and confusing users who had no idea why their primary provider
wasn't being used.

Changes:
- credential_pool: EXHAUSTED_TTL_AUTH_SECONDS = 24h for 401/403 errors
  (rate limits keep 1h cooldown, provider reset_at timestamps still
  override both)
- run_agent: emit actionable status message via _emit_status() when all
  pool credentials are rejected — tells the user to run
  `hermes auth reset <provider>` or `hermes model` to re-authenticate.
  Message propagates to both CLI (force-printed) and gateway (Telegram,
  Discord, etc.)
- Tests for all three TTL cases (401 stays exhausted at 1h, 401 resets
  at 24h, 403 stays exhausted at 1h) and auth exhaustion notification
  (emits when pool exhausted, silent when rotation succeeds)

Addresses user report: Copilot 401 + Codex 429 caused silent fallback
with no recovery path visible to the user.
This commit is contained in:
Teknium 2026-04-14 21:00:45 -07:00
parent 82f364ffd1
commit 0c9715f2ff
No known key found for this signature in database
4 changed files with 205 additions and 3 deletions

View file

@ -69,10 +69,10 @@ SUPPORTED_POOL_STRATEGIES = {
}
# Cooldown before retrying an exhausted credential.
# 429 (rate-limited) and 402 (billing/quota) both cool down after 1 hour.
# Provider-supplied reset_at timestamps override these defaults.
EXHAUSTED_TTL_429_SECONDS = 60 * 60 # 1 hour
EXHAUSTED_TTL_DEFAULT_SECONDS = 60 * 60 # 1 hour
EXHAUSTED_TTL_429_SECONDS = 60 * 60 # 1 hour (rate limits)
EXHAUSTED_TTL_AUTH_SECONDS = 24 * 60 * 60 # 24 hours (401/403 — token invalid)
EXHAUSTED_TTL_DEFAULT_SECONDS = 60 * 60 # 1 hour (everything else)
# Pool key prefix for custom OpenAI-compatible endpoints.
# Custom endpoints all share provider='custom' but are keyed by their
@ -193,6 +193,10 @@ def _exhausted_ttl(error_code: Optional[int]) -> int:
"""Return cooldown seconds based on the HTTP status that caused exhaustion."""
if error_code == 429:
return EXHAUSTED_TTL_429_SECONDS
if error_code in (401, 403):
# Auth failures are permanent until the user re-authenticates.
# Use a long cooldown to avoid retrying dead tokens every hour.
return EXHAUSTED_TTL_AUTH_SECONDS
return EXHAUSTED_TTL_DEFAULT_SECONDS

View file

@ -4754,6 +4754,14 @@ class AIAgent:
)
self._swap_credential(next_entry)
return True, False
# All credentials for this provider are exhausted due to auth failure.
# Emit an actionable notification so the user knows how to fix it.
_provider_label = getattr(self, "provider", "unknown")
self._emit_status(
f"🔐 All {_provider_label} credentials rejected (HTTP {rotate_status}). "
f"Run `hermes auth reset {_provider_label}` to clear, "
f"or `hermes model` to re-authenticate."
)
return False, has_retried_429

View file

@ -1156,3 +1156,117 @@ def test_load_pool_does_not_seed_qwen_oauth_when_no_token(tmp_path, monkeypatch)
assert not pool.has_credentials()
assert pool.entries() == []
# ---------------------------------------------------------------------------
# Auth failure TTL — 401/403 credentials stay exhausted for 24 hours
# ---------------------------------------------------------------------------
def test_exhausted_401_entry_stays_exhausted_after_one_hour(tmp_path, monkeypatch):
"""401-exhausted credentials should NOT reset after just 1 hour (token invalid)."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
monkeypatch.delenv("OPENROUTER_API_KEY", raising=False)
_write_auth_store(
tmp_path,
{
"version": 1,
"credential_pool": {
"openrouter": [
{
"id": "cred-1",
"label": "primary",
"auth_type": "api_key",
"priority": 0,
"source": "manual",
"access_token": "***",
"base_url": "https://openrouter.ai/api/v1",
"last_status": "exhausted",
"last_status_at": time.time() - 3700, # ~1h2m ago
"last_error_code": 401,
}
]
},
},
)
from agent.credential_pool import load_pool
pool = load_pool("openrouter")
entry = pool.select()
# 401 uses a 24-hour cooldown — 1 hour is NOT enough to reset
assert entry is None
def test_exhausted_401_entry_resets_after_24_hours(tmp_path, monkeypatch):
"""401-exhausted credentials should reset after 24 hours."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
monkeypatch.delenv("OPENROUTER_API_KEY", raising=False)
_write_auth_store(
tmp_path,
{
"version": 1,
"credential_pool": {
"openrouter": [
{
"id": "cred-1",
"label": "primary",
"auth_type": "api_key",
"priority": 0,
"source": "manual",
"access_token": "***",
"base_url": "https://openrouter.ai/api/v1",
"last_status": "exhausted",
"last_status_at": time.time() - 90000, # ~25 hours ago
"last_error_code": 401,
}
]
},
},
)
from agent.credential_pool import load_pool
pool = load_pool("openrouter")
entry = pool.select()
assert entry is not None
assert entry.id == "cred-1"
assert entry.last_status == "ok"
def test_exhausted_403_entry_stays_exhausted_after_one_hour(tmp_path, monkeypatch):
"""403-exhausted credentials should NOT reset after just 1 hour."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
monkeypatch.delenv("OPENROUTER_API_KEY", raising=False)
_write_auth_store(
tmp_path,
{
"version": 1,
"credential_pool": {
"openrouter": [
{
"id": "cred-1",
"label": "primary",
"auth_type": "api_key",
"priority": 0,
"source": "manual",
"access_token": "***",
"base_url": "https://openrouter.ai/api/v1",
"last_status": "exhausted",
"last_status_at": time.time() - 3700, # ~1h2m ago
"last_error_code": 403,
}
]
},
},
)
from agent.credential_pool import load_pool
pool = load_pool("openrouter")
entry = pool.select()
# 403 uses a 24-hour cooldown — 1 hour is NOT enough to reset
assert entry is None

View file

@ -348,3 +348,79 @@ class TestPoolRotationCycle:
)
assert recovered is False
assert has_retried is False
class TestAuthExhaustionNotification:
"""Verify user-facing notification when all credentials are rejected (401)."""
def _make_agent_with_empty_auth_pool(self):
from run_agent import AIAgent
with patch.object(AIAgent, "__init__", lambda self, **kw: None):
agent = AIAgent()
pool = MagicMock()
pool.has_credentials.return_value = True
pool.try_refresh_current.return_value = None
pool.mark_exhausted_and_rotate.return_value = None # no more credentials
agent._credential_pool = pool
agent._swap_credential = MagicMock()
agent.log_prefix = ""
agent.provider = "copilot"
agent.status_callback = None
# Capture _emit_status calls
agent._emit_status_calls = []
original_emit = getattr(AIAgent, "_emit_status", None)
def capture_emit(self_inner, msg):
agent._emit_status_calls.append(msg)
agent._emit_status = lambda msg: capture_emit(agent, msg)
return agent, pool
def test_auth_failure_emits_notification_when_pool_exhausted(self):
"""When all credentials are 401'd, user should see actionable message."""
from agent.error_classifier import FailoverReason
agent, pool = self._make_agent_with_empty_auth_pool()
recovered, _ = agent._recover_with_credential_pool(
status_code=401, has_retried_429=False,
classified_reason=FailoverReason.auth,
)
assert recovered is False
assert len(agent._emit_status_calls) == 1
msg = agent._emit_status_calls[0]
assert "copilot" in msg
assert "401" in msg
assert "hermes auth reset" in msg
def test_auth_failure_no_notification_when_rotation_succeeds(self):
"""When rotation succeeds, no exhaustion warning should be emitted."""
from agent.error_classifier import FailoverReason
from run_agent import AIAgent
with patch.object(AIAgent, "__init__", lambda self, **kw: None):
agent = AIAgent()
next_entry = MagicMock()
next_entry.id = "cred-2"
pool = MagicMock()
pool.has_credentials.return_value = True
pool.try_refresh_current.return_value = None
pool.mark_exhausted_and_rotate.return_value = next_entry
agent._credential_pool = pool
agent._swap_credential = MagicMock()
agent.log_prefix = ""
agent.provider = "copilot"
agent._emit_status_calls = []
agent._emit_status = lambda msg: agent._emit_status_calls.append(msg)
recovered, _ = agent._recover_with_credential_pool(
status_code=401, has_retried_429=False,
classified_reason=FailoverReason.auth,
)
assert recovered is True
assert len(agent._emit_status_calls) == 0