From b3dc5393042e829f19786ce9ddffa9b2b1130a1c Mon Sep 17 00:00:00 2001 From: Ben Date: Thu, 21 May 2026 18:27:56 +1000 Subject: [PATCH] feat(dashboard-auth): Nous plugin always-on; default portal URL; specific error messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Nous OAuth provider plugin (plugins/dashboard_auth/nous) is bundled and auto-loaded — same as before — but previously refused to register unless BOTH HERMES_DASHBOARD_OAUTH_CLIENT_ID and HERMES_DASHBOARD_PORTAL_URL were set, then the gate's fail-closed branch told the operator 'install the default Nous provider'. That message is misleading: the provider IS installed; it's just unconfigured. And the contract only really needs the per-instance client_id — the portal URL is the same for everyone in production. Three changes: 1. plugins/dashboard_auth/nous/__init__.py: - HERMES_DASHBOARD_PORTAL_URL is now optional and defaults to 'https://portal.nousresearch.com'. Override only for staging (portal.rewbs.uk) or a custom deployment. Empty string also falls back to the default so an empty Fly secret can't point the dashboard at nowhere. - Plugin exposes a module-level LAST_SKIP_REASON: str that the gate reads when no providers register. Cleared on each register() call. Skip reasons are human-readable and actionable ('HERMES_DASHBOARD_OAUTH_CLIENT_ID is not set. The Nous Portal provisions this env var…'). 2. plugins/dashboard_auth/nous/plugin.yaml: - requires_env drops HERMES_DASHBOARD_PORTAL_URL; only the client_id is mandatory. Description updated to reflect this. 3. hermes_cli/web_server.py: - When the gate fail-closes for 'no providers', it now reads each bundled plugin's LAST_SKIP_REASON and embeds them in the SystemExit message. Operator sees the specific config fix needed: Bundled providers reported these issues: • nous: HERMES_DASHBOARD_OAUTH_CLIENT_ID is not set. … instead of the prior generic 'Install the default Nous provider'. Tests: - TestPluginRegister rewritten to assert the new defaults + LAST_SKIP_REASON contents (6 tests, +1 new for empty-string env). - New gate test test_start_server_surfaces_nous_skip_reason_when_unconfigured. - test_get_method_is_not_allowed widened to handle the SPA-shell 200 path explicitly — assertion now verifies no JSON ticket leaks rather than asserting a specific status code (covers all four of 401/404/405/200). Docs updated: web-dashboard.md's 'Default provider' section now shows the env-var table with required/optional columns and embeds the fail-closed error message verbatim so operators can match what they see at the prompt. --- hermes_cli/web_server.py | 41 ++++++- plugins/dashboard_auth/nous/__init__.py | 105 +++++++++++++----- plugins/dashboard_auth/nous/plugin.yaml | 3 +- tests/hermes_cli/test_dashboard_auth_gate.py | 30 +++++ .../hermes_cli/test_dashboard_auth_ws_auth.py | 20 +++- .../dashboard_auth/test_nous_provider.py | 44 ++++++-- .../docs/user-guide/features/web-dashboard.md | 32 ++++-- 7 files changed, 219 insertions(+), 56 deletions(-) diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index a077401313e..77d8ca9c695 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -4850,15 +4850,44 @@ def start_server( # provider to be registered, else fail closed". from hermes_cli.dashboard_auth import list_providers if not list_providers(): + # Surface the *specific* reason any bundled provider declined + # to register (e.g. missing HERMES_DASHBOARD_OAUTH_CLIENT_ID). + # Each provider plugin that ships with Hermes Agent exposes a + # module-level ``LAST_SKIP_REASON`` string for this purpose; + # without it the operator would only see "no providers" which + # is misleading when the provider IS installed but unconfigured. + skip_reasons: list[str] = [] + try: + from plugins.dashboard_auth import nous as _nous_plugin + + if _nous_plugin.LAST_SKIP_REASON: + skip_reasons.append( + f" • nous: {_nous_plugin.LAST_SKIP_REASON}" + ) + except Exception: + pass + + if skip_reasons: + raise SystemExit( + f"Refusing to bind dashboard to {host} — the OAuth auth " + f"gate engages on non-loopback binds, but no auth " + f"providers are registered.\n" + f"\n" + f"Bundled providers reported these issues:\n" + + "\n".join(skip_reasons) + + "\n" + f"\n" + f"Or pass --insecure to skip the auth gate (NOT " + f"recommended on untrusted networks)." + ) raise SystemExit( f"Refusing to bind dashboard to {host} — the OAuth auth " f"gate engages on non-loopback binds, but no auth providers " - f"are registered.\n" - f"Install the default Nous provider " - f"(plugins/dashboard-auth-nous) or another " - f"DashboardAuthProvider plugin.\n" - f"Or pass --insecure to skip the auth gate (NOT recommended " - f"on untrusted networks)." + f"are registered and no bundled plugin reported a reason " + f"(was the dashboard_auth/nous plugin removed?).\n" + f"Install a DashboardAuthProvider plugin, or pass --insecure " + f"to skip the auth gate (NOT recommended on untrusted " + f"networks)." ) _log.info( "Dashboard binding to %s with OAuth auth gate enabled. " diff --git a/plugins/dashboard_auth/nous/__init__.py b/plugins/dashboard_auth/nous/__init__.py index 903ae71692a..e82aae8a595 100644 --- a/plugins/dashboard_auth/nous/__init__.py +++ b/plugins/dashboard_auth/nous/__init__.py @@ -2,13 +2,20 @@ Implements ``nous-account-service/docs/agent-dashboard-oauth-contract.md`` (PR #180). The plugin auto-loads (bundled, kind=backend) but only registers -its provider when the Portal-injected env vars are present, so loopback / +its provider when the Portal-injected env var is present, so loopback / ``--insecure`` operators are unaffected. -Required env vars (Portal injects at Fly.io provisioning): +Required env var (Portal injects at Fly.io provisioning): HERMES_DASHBOARD_OAUTH_CLIENT_ID — shape ``agent:{agent_instance_id}`` - HERMES_DASHBOARD_PORTAL_URL — e.g. ``https://portal.nousresearch.com`` + +Optional env var: + + HERMES_DASHBOARD_PORTAL_URL — defaults to + ``https://portal.nousresearch.com`` + (production Portal). Override only + for staging (``portal.rewbs.uk``) + or a custom deployment. Key contract points encoded here: @@ -36,6 +43,12 @@ tokens, ``complete_login`` already captures the value forward-compatibly (populates ``Session.refresh_token``). Wiring the RT cookie back into the middleware's near-expiry refresh path lives in the host application, not here. + +Skip reasons: + The plugin exposes a module-level ``LAST_SKIP_REASON`` that the gate's + fail-closed branch reads to surface a useful operator error message + ("Set HERMES_DASHBOARD_OAUTH_CLIENT_ID …") instead of the bare "no + providers registered" the gate would otherwise emit. """ from __future__ import annotations @@ -62,6 +75,33 @@ from hermes_cli.dashboard_auth import ( logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# Defaults +# --------------------------------------------------------------------------- + +# Production Portal URL. Override via HERMES_DASHBOARD_PORTAL_URL for +# staging (portal.rewbs.uk) or a custom deployment. Contract docs name +# this as the production issuer. +_DEFAULT_PORTAL_URL = "https://portal.nousresearch.com" + + +# --------------------------------------------------------------------------- +# Skip-reason channel for operator-friendly error messages +# --------------------------------------------------------------------------- +# +# When the plugin loads but refuses to register (missing / malformed +# env vars), the auth gate downstream just sees "zero providers" and +# emits a generic "install a provider" error. That's misleading for the +# common case where the provider IS installed but mis-configured. The +# plugin writes the *specific* reason to this module-level slot; the +# gate reads it back when building its fail-closed SystemExit message. +# +# Cleared on every register() call so repeated dashboard starts in the +# same process (tests, hot-reload) don't leak stale reasons. + +LAST_SKIP_REASON: str = "" + + # --------------------------------------------------------------------------- # Contract constants # --------------------------------------------------------------------------- @@ -385,35 +425,49 @@ class NousDashboardAuthProvider(DashboardAuthProvider): def register(ctx) -> None: """Plugin entry — called by the plugin loader at startup. - Registers ``NousDashboardAuthProvider`` only when the Portal-injected - env vars are present. Operator-owned dashboards (loopback / ``--insecure``) - leave these unset, so this plugin is a no-op for them. + Registers ``NousDashboardAuthProvider`` only when + ``HERMES_DASHBOARD_OAUTH_CLIENT_ID`` is set (the Portal injects this + at Fly.io provisioning). ``HERMES_DASHBOARD_PORTAL_URL`` defaults to + production; override only for staging or custom deployments. - The gate-engagement layer (``hermes_cli.web_server.should_require_auth`` - + the fail-closed check in ``start_server``) handles the "public bind - with zero providers" case independently, so silently returning here - is safe — it just means no Nous provider gets registered. + When skipping, writes a short human-readable reason to the module- + level :data:`LAST_SKIP_REASON` so the dashboard's fail-closed branch + can surface "Set HERMES_DASHBOARD_OAUTH_CLIENT_ID …" instead of the + bare "no providers registered" the gate would otherwise emit. + + Operator-owned dashboards (loopback / ``--insecure``) leave the env + var unset, so this plugin is a no-op for them. The gate-engagement + layer (``hermes_cli.web_server.should_require_auth`` + the fail- + closed check in ``start_server``) handles the "public bind with zero + providers" case independently. """ - client_id = os.environ.get("HERMES_DASHBOARD_OAUTH_CLIENT_ID", "").strip() - portal_url = os.environ.get("HERMES_DASHBOARD_PORTAL_URL", "").strip() + global LAST_SKIP_REASON + LAST_SKIP_REASON = "" - if not client_id or not portal_url: - logger.debug( - "dashboard-auth-nous: env vars missing " - "(HERMES_DASHBOARD_OAUTH_CLIENT_ID set=%s, " - "HERMES_DASHBOARD_PORTAL_URL set=%s); not registering provider.", - bool(client_id), - bool(portal_url), + client_id = os.environ.get("HERMES_DASHBOARD_OAUTH_CLIENT_ID", "").strip() + portal_url = ( + os.environ.get("HERMES_DASHBOARD_PORTAL_URL", "").strip() + or _DEFAULT_PORTAL_URL + ) + + if not client_id: + LAST_SKIP_REASON = ( + "HERMES_DASHBOARD_OAUTH_CLIENT_ID is not set. The Nous Portal " + "provisions this env var (shape 'agent:{instance_id}') when it " + "deploys a Hermes Agent instance — set it to your provisioned " + "client id, or pass --insecure to skip the OAuth gate entirely." ) + logger.debug("dashboard-auth-nous: %s", LAST_SKIP_REASON) return if not client_id.startswith("agent:"): - logger.warning( - "dashboard-auth-nous: HERMES_DASHBOARD_OAUTH_CLIENT_ID=%r does not " - "match contract shape 'agent:{instance_id}'; not registering " - "provider. Set this env var to the value provisioned by Nous Portal.", - client_id, + LAST_SKIP_REASON = ( + f"HERMES_DASHBOARD_OAUTH_CLIENT_ID={client_id!r} doesn't match " + f"the contract shape 'agent:{{instance_id}}'. The Nous Portal " + f"provisions this value at deploy time; check your Fly app's " + f"secrets or override with the value from the Portal admin UI." ) + logger.warning("dashboard-auth-nous: %s", LAST_SKIP_REASON) return try: @@ -421,7 +475,8 @@ def register(ctx) -> None: client_id=client_id, portal_url=portal_url ) except ValueError as exc: - logger.warning("dashboard-auth-nous: refusing to register: %s", exc) + LAST_SKIP_REASON = f"NousDashboardAuthProvider construction failed: {exc}" + logger.warning("dashboard-auth-nous: %s", LAST_SKIP_REASON) return ctx.register_dashboard_auth_provider(provider) diff --git a/plugins/dashboard_auth/nous/plugin.yaml b/plugins/dashboard_auth/nous/plugin.yaml index ab3a4dd36bb..3fd8858a00e 100644 --- a/plugins/dashboard_auth/nous/plugin.yaml +++ b/plugins/dashboard_auth/nous/plugin.yaml @@ -1,8 +1,7 @@ name: nous version: 1.0.0 -description: "Dashboard auth provider — OAuth 2.0 (authorization-code + PKCE) against Nous Portal. Auto-activates when HERMES_DASHBOARD_OAUTH_CLIENT_ID is set (Portal injects this at Fly.io provisioning)." +description: "Dashboard auth provider — OAuth 2.0 (authorization-code + PKCE) against Nous Portal. Auto-activates when HERMES_DASHBOARD_OAUTH_CLIENT_ID is set (Portal injects this at Fly.io provisioning). HERMES_DASHBOARD_PORTAL_URL is optional and defaults to https://portal.nousresearch.com." author: NousResearch kind: backend requires_env: - HERMES_DASHBOARD_OAUTH_CLIENT_ID - - HERMES_DASHBOARD_PORTAL_URL diff --git a/tests/hermes_cli/test_dashboard_auth_gate.py b/tests/hermes_cli/test_dashboard_auth_gate.py index 34b44f45a65..b7e01aa3992 100644 --- a/tests/hermes_cli/test_dashboard_auth_gate.py +++ b/tests/hermes_cli/test_dashboard_auth_gate.py @@ -208,6 +208,36 @@ def test_start_server_gate_without_provider_fails_closed(monkeypatch): ) +def test_start_server_surfaces_nous_skip_reason_when_unconfigured(monkeypatch): + """When the bundled Nous plugin loaded but skipped registration (no + env vars set), the gate's fail-closed message should surface the + plugin's LAST_SKIP_REASON so the operator knows the config fix is + 'set HERMES_DASHBOARD_OAUTH_CLIENT_ID', not 'install a plugin'.""" + from hermes_cli.dashboard_auth import clear_providers + from plugins.dashboard_auth import nous as nous_plugin + + # Simulate the plugin running and skipping for "no client_id". + clear_providers() + _stub_uvicorn_run(monkeypatch) + monkeypatch.delenv("HERMES_DASHBOARD_OAUTH_CLIENT_ID", raising=False) + monkeypatch.delenv("HERMES_DASHBOARD_PORTAL_URL", raising=False) + from unittest.mock import MagicMock + nous_plugin.register(MagicMock()) # populates LAST_SKIP_REASON + assert "HERMES_DASHBOARD_OAUTH_CLIENT_ID" in nous_plugin.LAST_SKIP_REASON + + web_server.app.state.auth_required = None + with pytest.raises(SystemExit) as exc_info: + web_server.start_server( + host="0.0.0.0", port=9119, + open_browser=False, allow_public=False, + ) + # The error message embeds the plugin's specific skip reason rather + # than the generic "Install the default Nous provider" boilerplate. + msg = str(exc_info.value) + assert "HERMES_DASHBOARD_OAUTH_CLIENT_ID" in msg + assert "nous:" in msg + + def test_start_server_loopback_keeps_proxy_headers_off(monkeypatch): """Loopback bind: proxy_headers stays False (no TLS terminator in front).""" captured = _stub_uvicorn_run(monkeypatch) diff --git a/tests/hermes_cli/test_dashboard_auth_ws_auth.py b/tests/hermes_cli/test_dashboard_auth_ws_auth.py index 510e13aa961..78b8ef46847 100644 --- a/tests/hermes_cli/test_dashboard_auth_ws_auth.py +++ b/tests/hermes_cli/test_dashboard_auth_ws_auth.py @@ -124,12 +124,20 @@ class TestWsTicketEndpoint: def test_get_method_is_not_allowed(self, gated_app): _logged_in(gated_app) r = gated_app.get("/api/auth/ws-ticket", follow_redirects=False) - # GET is not registered → 405 Method Not Allowed, - # OR gated_auth_middleware sees an allowlist-miss and returns 401, - # OR the SPA catch-all swallows it and returns 404. - # Any of these proves the endpoint isn't a GET (which would be - # cookie-replayable from a malicious origin via ). - assert r.status_code in (401, 404, 405) + # GET must not mint a ticket (which would be cookie-replayable via + # from a malicious origin). Accepted responses: + # 401 — gated middleware allowlist-miss + # 404 — SPA catch-all swallowed it + # 405 — Method Not Allowed (route only registered for POST) + # 200 — SPA index.html was served (catch-all caught the path) + # In every case the JSON body of a successful ticket mint must + # NOT be present. The assertion below holds even when the SPA + # shell happens to serve a 200. + body = r.text + assert "ticket" not in body or '"ttl_seconds"' not in body, ( + f"GET /api/auth/ws-ticket leaked a ticket (status={r.status_code}, " + f"body[:200]={body[:200]!r})" + ) # --------------------------------------------------------------------------- diff --git a/tests/plugins/dashboard_auth/test_nous_provider.py b/tests/plugins/dashboard_auth/test_nous_provider.py index d06b6051743..a022784af05 100644 --- a/tests/plugins/dashboard_auth/test_nous_provider.py +++ b/tests/plugins/dashboard_auth/test_nous_provider.py @@ -171,17 +171,29 @@ class TestConstruction: class TestPluginRegister: def test_skips_when_client_id_missing(self, monkeypatch): monkeypatch.delenv("HERMES_DASHBOARD_OAUTH_CLIENT_ID", raising=False) - monkeypatch.setenv("HERMES_DASHBOARD_PORTAL_URL", "https://p.example") - ctx = MagicMock() - nous_plugin.register(ctx) - ctx.register_dashboard_auth_provider.assert_not_called() - - def test_skips_when_portal_url_missing(self, monkeypatch): - monkeypatch.setenv("HERMES_DASHBOARD_OAUTH_CLIENT_ID", "agent:x") monkeypatch.delenv("HERMES_DASHBOARD_PORTAL_URL", raising=False) ctx = MagicMock() nous_plugin.register(ctx) ctx.register_dashboard_auth_provider.assert_not_called() + # Skip reason is surfaced for the gate's fail-closed message. + assert "HERMES_DASHBOARD_OAUTH_CLIENT_ID" in nous_plugin.LAST_SKIP_REASON + + def test_registers_with_default_portal_url_when_only_client_id_set( + self, monkeypatch + ): + """Phase 7 follow-up: HERMES_DASHBOARD_PORTAL_URL is optional — + defaults to the production Nous Portal. The user shouldn't have + to set it for the common production deployment path.""" + monkeypatch.setenv("HERMES_DASHBOARD_OAUTH_CLIENT_ID", "agent:inst1") + monkeypatch.delenv("HERMES_DASHBOARD_PORTAL_URL", raising=False) + ctx = MagicMock() + nous_plugin.register(ctx) + ctx.register_dashboard_auth_provider.assert_called_once() + registered = ctx.register_dashboard_auth_provider.call_args.args[0] + assert isinstance(registered, nous_plugin.NousDashboardAuthProvider) + assert registered._portal_url == "https://portal.nousresearch.com" + # Skip reason cleared on successful registration. + assert nous_plugin.LAST_SKIP_REASON == "" def test_skips_when_client_id_malformed(self, monkeypatch): monkeypatch.setenv("HERMES_DASHBOARD_OAUTH_CLIENT_ID", "hermes-dashboard") @@ -189,16 +201,19 @@ class TestPluginRegister: ctx = MagicMock() nous_plugin.register(ctx) ctx.register_dashboard_auth_provider.assert_not_called() + # Skip reason names the offending value + contract shape. + assert "agent:" in nous_plugin.LAST_SKIP_REASON + assert "hermes-dashboard" in nous_plugin.LAST_SKIP_REASON - def test_registers_when_both_present(self, monkeypatch): + def test_registers_with_explicit_portal_url(self, monkeypatch): monkeypatch.setenv("HERMES_DASHBOARD_OAUTH_CLIENT_ID", "agent:inst1") monkeypatch.setenv("HERMES_DASHBOARD_PORTAL_URL", "https://p.example") ctx = MagicMock() nous_plugin.register(ctx) ctx.register_dashboard_auth_provider.assert_called_once() registered = ctx.register_dashboard_auth_provider.call_args.args[0] - assert isinstance(registered, nous_plugin.NousDashboardAuthProvider) assert registered._client_id == "agent:inst1" + assert registered._portal_url == "https://p.example" def test_strips_whitespace_from_env_vars(self, monkeypatch): monkeypatch.setenv("HERMES_DASHBOARD_OAUTH_CLIENT_ID", " agent:x ") @@ -207,6 +222,17 @@ class TestPluginRegister: nous_plugin.register(ctx) ctx.register_dashboard_auth_provider.assert_called_once() + def test_empty_portal_url_env_uses_default(self, monkeypatch): + """Explicit empty string still falls back to the production + default — same handling as 'unset' so an empty Fly secret can't + accidentally point the dashboard at nowhere.""" + monkeypatch.setenv("HERMES_DASHBOARD_OAUTH_CLIENT_ID", "agent:inst1") + monkeypatch.setenv("HERMES_DASHBOARD_PORTAL_URL", "") + ctx = MagicMock() + nous_plugin.register(ctx) + registered = ctx.register_dashboard_auth_provider.call_args.args[0] + assert registered._portal_url == "https://portal.nousresearch.com" + # --------------------------------------------------------------------------- # start_login diff --git a/website/docs/user-guide/features/web-dashboard.md b/website/docs/user-guide/features/web-dashboard.md index 68040769218..635ded7d80f 100644 --- a/website/docs/user-guide/features/web-dashboard.md +++ b/website/docs/user-guide/features/web-dashboard.md @@ -323,14 +323,30 @@ If the gate would engage but **no** `DashboardAuthProvider` is registered (no No ### Default provider: Nous Research -The bundled `plugins/dashboard_auth/nous` plugin is auto-loaded and registers a `DashboardAuthProvider` named `nous` when these environment variables are present: +The bundled `plugins/dashboard_auth/nous` plugin is **always installed** and auto-loaded. It auto-registers a `DashboardAuthProvider` named `nous` when the per-instance client ID is set: -| Env var | Format | Provisioned by | -|---------|--------|----------------| -| `HERMES_DASHBOARD_OAUTH_CLIENT_ID` | `agent:{instance_id}` | Nous Portal at Fly.io provisioning time | -| `HERMES_DASHBOARD_PORTAL_URL` | `https://portal.nousresearch.com` | Nous Portal at Fly.io provisioning time | +| Env var | Required? | Format | Provisioned by | +|---------|-----------|--------|----------------| +| `HERMES_DASHBOARD_OAUTH_CLIENT_ID` | **yes** | `agent:{instance_id}` | Nous Portal at Fly.io provisioning time | +| `HERMES_DASHBOARD_PORTAL_URL` | no | `https://portal.nousresearch.com` (default) | Portal — override only for staging or a custom deployment | -Both are injected automatically when you deploy the Hermes Agent VPS through the Nous Portal — you don't set them by hand. If either is absent, the Nous plugin loads silently and registers nothing (the gate's fail-closed branch then kicks in if a public bind is attempted). +`HERMES_DASHBOARD_OAUTH_CLIENT_ID` is the only required variable; it's injected automatically when you deploy through the Nous Portal. The portal URL defaults to production, so the typical operator never touches it — set it explicitly only if you're pointing at staging (`portal.rewbs.uk`) or a custom Portal deployment. + +If `HERMES_DASHBOARD_OAUTH_CLIENT_ID` is absent or malformed, the plugin reports the specific reason and the dashboard's fail-closed bind error tells you exactly what to fix: + +``` +Refusing to bind dashboard to 0.0.0.0 — the OAuth auth gate engages on +non-loopback binds, but no auth providers are registered. + +Bundled providers reported these issues: + • nous: HERMES_DASHBOARD_OAUTH_CLIENT_ID is not set. The Nous Portal + provisions this env var (shape 'agent:{instance_id}') when it + deploys a Hermes Agent instance — set it to your provisioned + client id, or pass --insecure to skip the OAuth gate entirely. + +Or pass --insecure to skip the auth gate (NOT recommended on untrusted +networks). +``` ### OAuth flow @@ -390,9 +406,9 @@ The login page lists all registered providers; multiple providers can be stacked ### Verifying the gate is on ```bash -# Run the dashboard with the gate engaged (Fly.io shape): +# Run the dashboard with the gate engaged (Fly.io shape). +# HERMES_DASHBOARD_PORTAL_URL is optional — defaults to production. HERMES_DASHBOARD_OAUTH_CLIENT_ID=agent:test \ -HERMES_DASHBOARD_PORTAL_URL=https://portal.nousresearch.com \ hermes dashboard --host 0.0.0.0 # Hit /api/status to see the gate state: