fix: 24h cooldown for 401/403 auth failures + user notification

Previously, credentials exhausted due to 401 (invalid token) or 403 (forbidden) used the same 1-hour cooldown as 429 rate limits. This meant the system would retry an invalid token every hour forever — burning API calls and confusing users who had no idea why their primary provider wasn't being used. Changes: - credential_pool: EXHAUSTED_TTL_AUTH_SECONDS = 24h for 401/403 errors (rate limits keep 1h cooldown, provider reset_at timestamps still override both) - run_agent: emit actionable status message via _emit_status() when all pool credentials are rejected — tells the user to run `hermes auth reset <provider>` or `hermes model` to re-authenticate. Message propagates to both CLI (force-printed) and gateway (Telegram, Discord, etc.) - Tests for all three TTL cases (401 stays exhausted at 1h, 401 resets at 24h, 403 stays exhausted at 1h) and auth exhaustion notification (emits when pool exhausted, silent when rotation succeeds) Addresses user report: Copilot 401 + Codex 429 caused silent fallback with no recovery path visible to the user.
2026-04-25 00:51:20 +00:00 · 2026-04-14 21:00:45 -07:00 · 2026-04-14 21:00:45 -07:00 · 0c9715f2ff
commit 0c9715f2ff
parent 82f364ffd1
4 changed files with 205 additions and 3 deletions
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@ -69,10 +69,10 @@ SUPPORTED_POOL_STRATEGIES = {
 }

 # Cooldown before retrying an exhausted credential.
-# 429 (rate-limited) and 402 (billing/quota) both cool down after 1 hour.
 # Provider-supplied reset_at timestamps override these defaults.
-EXHAUSTED_TTL_429_SECONDS = 60 * 60          # 1 hour
-EXHAUSTED_TTL_DEFAULT_SECONDS = 60 * 60      # 1 hour
+EXHAUSTED_TTL_429_SECONDS = 60 * 60          # 1 hour  (rate limits)
+EXHAUSTED_TTL_AUTH_SECONDS = 24 * 60 * 60    # 24 hours (401/403 — token invalid)
+EXHAUSTED_TTL_DEFAULT_SECONDS = 60 * 60      # 1 hour  (everything else)

 # Pool key prefix for custom OpenAI-compatible endpoints.
 # Custom endpoints all share provider='custom' but are keyed by their
@ -193,6 +193,10 @@ def _exhausted_ttl(error_code: Optional[int]) -> int:
    """Return cooldown seconds based on the HTTP status that caused exhaustion."""
    if error_code == 429:
        return EXHAUSTED_TTL_429_SECONDS
+    if error_code in (401, 403):
+        # Auth failures are permanent until the user re-authenticates.
+        # Use a long cooldown to avoid retrying dead tokens every hour.
+        return EXHAUSTED_TTL_AUTH_SECONDS
    return EXHAUSTED_TTL_DEFAULT_SECONDS


--- a/run_agent.py
+++ b/run_agent.py
@ -4754,6 +4754,14 @@ class AIAgent:
                )
                self._swap_credential(next_entry)
                return True, False
+            # All credentials for this provider are exhausted due to auth failure.
+            # Emit an actionable notification so the user knows how to fix it.
+            _provider_label = getattr(self, "provider", "unknown")
+            self._emit_status(
+                f"🔐 All {_provider_label} credentials rejected (HTTP {rotate_status}). "
+                f"Run `hermes auth reset {_provider_label}` to clear, "
+                f"or `hermes model` to re-authenticate."
+            )

        return False, has_retried_429

--- a/tests/agent/test_credential_pool.py
+++ b/tests/agent/test_credential_pool.py
@ -1156,3 +1156,117 @@ def test_load_pool_does_not_seed_qwen_oauth_when_no_token(tmp_path, monkeypatch)

    assert not pool.has_credentials()
    assert pool.entries() == []
+
+
+# ---------------------------------------------------------------------------
+# Auth failure TTL — 401/403 credentials stay exhausted for 24 hours
+# ---------------------------------------------------------------------------
+
+
+def test_exhausted_401_entry_stays_exhausted_after_one_hour(tmp_path, monkeypatch):
+    """401-exhausted credentials should NOT reset after just 1 hour (token invalid)."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
+    monkeypatch.delenv("OPENROUTER_API_KEY", raising=False)
+    _write_auth_store(
+        tmp_path,
+        {
+            "version": 1,
+            "credential_pool": {
+                "openrouter": [
+                    {
+                        "id": "cred-1",
+                        "label": "primary",
+                        "auth_type": "api_key",
+                        "priority": 0,
+                        "source": "manual",
+                        "access_token": "***",
+                        "base_url": "https://openrouter.ai/api/v1",
+                        "last_status": "exhausted",
+                        "last_status_at": time.time() - 3700,  # ~1h2m ago
+                        "last_error_code": 401,
+                    }
+                ]
+            },
+        },
+    )
+
+    from agent.credential_pool import load_pool
+
+    pool = load_pool("openrouter")
+    entry = pool.select()
+
+    # 401 uses a 24-hour cooldown — 1 hour is NOT enough to reset
+    assert entry is None
+
+
+def test_exhausted_401_entry_resets_after_24_hours(tmp_path, monkeypatch):
+    """401-exhausted credentials should reset after 24 hours."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
+    monkeypatch.delenv("OPENROUTER_API_KEY", raising=False)
+    _write_auth_store(
+        tmp_path,
+        {
+            "version": 1,
+            "credential_pool": {
+                "openrouter": [
+                    {
+                        "id": "cred-1",
+                        "label": "primary",
+                        "auth_type": "api_key",
+                        "priority": 0,
+                        "source": "manual",
+                        "access_token": "***",
+                        "base_url": "https://openrouter.ai/api/v1",
+                        "last_status": "exhausted",
+                        "last_status_at": time.time() - 90000,  # ~25 hours ago
+                        "last_error_code": 401,
+                    }
+                ]
+            },
+        },
+    )
+
+    from agent.credential_pool import load_pool
+
+    pool = load_pool("openrouter")
+    entry = pool.select()
+
+    assert entry is not None
+    assert entry.id == "cred-1"
+    assert entry.last_status == "ok"
+
+
+def test_exhausted_403_entry_stays_exhausted_after_one_hour(tmp_path, monkeypatch):
+    """403-exhausted credentials should NOT reset after just 1 hour."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path / "hermes"))
+    monkeypatch.delenv("OPENROUTER_API_KEY", raising=False)
+    _write_auth_store(
+        tmp_path,
+        {
+            "version": 1,
+            "credential_pool": {
+                "openrouter": [
+                    {
+                        "id": "cred-1",
+                        "label": "primary",
+                        "auth_type": "api_key",
+                        "priority": 0,
+                        "source": "manual",
+                        "access_token": "***",
+                        "base_url": "https://openrouter.ai/api/v1",
+                        "last_status": "exhausted",
+                        "last_status_at": time.time() - 3700,  # ~1h2m ago
+                        "last_error_code": 403,
+                    }
+                ]
+            },
+        },
+    )
+
+    from agent.credential_pool import load_pool
+
+    pool = load_pool("openrouter")
+    entry = pool.select()
+
+    # 403 uses a 24-hour cooldown — 1 hour is NOT enough to reset
+    assert entry is None
--- a/tests/agent/test_credential_pool_routing.py
+++ b/tests/agent/test_credential_pool_routing.py
@ -348,3 +348,79 @@ class TestPoolRotationCycle:
        )
        assert recovered is False
        assert has_retried is False
+
+
+class TestAuthExhaustionNotification:
+    """Verify user-facing notification when all credentials are rejected (401)."""
+
+    def _make_agent_with_empty_auth_pool(self):
+        from run_agent import AIAgent
+
+        with patch.object(AIAgent, "__init__", lambda self, **kw: None):
+            agent = AIAgent()
+
+        pool = MagicMock()
+        pool.has_credentials.return_value = True
+        pool.try_refresh_current.return_value = None
+        pool.mark_exhausted_and_rotate.return_value = None  # no more credentials
+        agent._credential_pool = pool
+        agent._swap_credential = MagicMock()
+        agent.log_prefix = ""
+        agent.provider = "copilot"
+        agent.status_callback = None
+
+        # Capture _emit_status calls
+        agent._emit_status_calls = []
+        original_emit = getattr(AIAgent, "_emit_status", None)
+
+        def capture_emit(self_inner, msg):
+            agent._emit_status_calls.append(msg)
+        agent._emit_status = lambda msg: capture_emit(agent, msg)
+
+        return agent, pool
+
+    def test_auth_failure_emits_notification_when_pool_exhausted(self):
+        """When all credentials are 401'd, user should see actionable message."""
+        from agent.error_classifier import FailoverReason
+
+        agent, pool = self._make_agent_with_empty_auth_pool()
+
+        recovered, _ = agent._recover_with_credential_pool(
+            status_code=401, has_retried_429=False,
+            classified_reason=FailoverReason.auth,
+        )
+        assert recovered is False
+        assert len(agent._emit_status_calls) == 1
+        msg = agent._emit_status_calls[0]
+        assert "copilot" in msg
+        assert "401" in msg
+        assert "hermes auth reset" in msg
+
+    def test_auth_failure_no_notification_when_rotation_succeeds(self):
+        """When rotation succeeds, no exhaustion warning should be emitted."""
+        from agent.error_classifier import FailoverReason
+        from run_agent import AIAgent
+
+        with patch.object(AIAgent, "__init__", lambda self, **kw: None):
+            agent = AIAgent()
+
+        next_entry = MagicMock()
+        next_entry.id = "cred-2"
+        pool = MagicMock()
+        pool.has_credentials.return_value = True
+        pool.try_refresh_current.return_value = None
+        pool.mark_exhausted_and_rotate.return_value = next_entry
+        agent._credential_pool = pool
+        agent._swap_credential = MagicMock()
+        agent.log_prefix = ""
+        agent.provider = "copilot"
+
+        agent._emit_status_calls = []
+        agent._emit_status = lambda msg: agent._emit_status_calls.append(msg)
+
+        recovered, _ = agent._recover_with_credential_pool(
+            status_code=401, has_retried_429=False,
+            classified_reason=FailoverReason.auth,
+        )
+        assert recovered is True
+        assert len(agent._emit_status_calls) == 0