hermes-agent/plugins/dashboard_auth/nous/__init__.py
Ben a498485631 feat(dashboard-auth-nous): surface token iss/aud in verification-failure error
When jwt.decode raises InvalidTokenError, decode the token a second time
without signature verification (safe — we never trust the values, just
display them) and append the actual iss/aud claims plus our configured
expected values to the error message. Lets operators see config drift
between HERMES_DASHBOARD_PORTAL_URL / HERMES_DASHBOARD_OAUTH_CLIENT_ID
and what Portal is actually emitting without having to hand-decode the
JWT from the browser cookie.
2026-05-27 02:12:27 -07:00

507 lines
20 KiB
Python

"""NousDashboardAuthProvider — Nous Portal OAuth (authorization-code + PKCE).
Implements ``nous-account-service/docs/agent-dashboard-oauth-contract.md``
(PR #180). The plugin auto-loads (bundled, kind=backend) but only registers
its provider when the Portal-injected env var is present, so loopback /
``--insecure`` operators are unaffected.
Required env var (Portal injects at Fly.io provisioning):
HERMES_DASHBOARD_OAUTH_CLIENT_ID — shape ``agent:{agent_instance_id}``
Optional env var:
HERMES_DASHBOARD_PORTAL_URL — defaults to
``https://portal.nousresearch.com``
(production Portal). Override only
for staging (``portal.rewbs.uk``)
or a custom deployment.
Key contract points encoded here:
- client_id is per-instance (``agent:{instance_id}``); the suffix is also
cross-checked against the token's ``agent_instance_id`` claim as
defense-in-depth.
- scope is ``agent_dashboard:access`` only (no OIDC scopes).
- tokens are RS256 JWTs verified against ``/.well-known/jwks.json``;
JWKS is cached for 5 minutes.
- V1 has NO refresh tokens — ``refresh_session`` always raises
``RefreshExpiredError`` so the middleware redirects to ``/auth/login``.
- audience claim is the bare ``client_id`` (no ``hermes-cli:`` prefix).
- tolerant ``oauth_contract_version`` check: missing → warn + proceed;
present and ``!= 1`` → refuse.
The cookie payload returned by ``start_login`` stashes the PKCE
``code_verifier`` and the OAuth ``state`` parameter for the
``/auth/callback`` handler to retrieve. The provider hands back
``{"hermes_session_pkce": "state=…;verifier=…"}`` (a single
semicolon-delimited string of ``key=value`` segments); the auth-route
layer prepends ``provider=`` and serializes it into the
``hermes_session_pkce`` cookie.
Forward compatibility: if a future Portal contract starts issuing refresh
tokens, ``complete_login`` already captures the value forward-compatibly
(populates ``Session.refresh_token``). Wiring the RT cookie back into the
middleware's near-expiry refresh path lives in the host application, not
here.
Skip reasons:
The plugin exposes a module-level ``LAST_SKIP_REASON`` that the gate's
fail-closed branch reads to surface a useful operator error message
("Set HERMES_DASHBOARD_OAUTH_CLIENT_ID …") instead of the bare "no
providers registered" the gate would otherwise emit.
"""
from __future__ import annotations
import base64
import hashlib
import logging
import os
import secrets
import urllib.parse
from typing import Any, Dict, Optional
import httpx
from hermes_cli.dashboard_auth import (
DashboardAuthProvider,
InvalidCodeError,
LoginStart,
ProviderError,
RefreshExpiredError,
Session,
)
# Module-level logger; used by the provider and by register() below.
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Defaults
# ---------------------------------------------------------------------------
# Production Portal URL. Override via HERMES_DASHBOARD_PORTAL_URL for
# staging (portal.rewbs.uk) or a custom deployment. Contract docs name
# this as the production issuer. register() falls back to this value when
# the env var is unset or blank.
_DEFAULT_PORTAL_URL = "https://portal.nousresearch.com"
# ---------------------------------------------------------------------------
# Skip-reason channel for operator-friendly error messages
# ---------------------------------------------------------------------------
#
# When the plugin loads but refuses to register (missing / malformed
# env vars), the auth gate downstream just sees "zero providers" and
# emits a generic "install a provider" error. That's misleading for the
# common case where the provider IS installed but mis-configured. The
# plugin writes the *specific* reason to this module-level slot; the
# gate reads it back when building its fail-closed SystemExit message.
#
# Cleared on every register() call so repeated dashboard starts in the
# same process (tests, hot-reload) don't leak stale reasons. The empty
# string means "no skip reason recorded".
LAST_SKIP_REASON: str = ""
# ---------------------------------------------------------------------------
# Contract constants
# ---------------------------------------------------------------------------
# Contract C3: scope name for the dashboard flow. Sent as the ``scope``
# query parameter of the authorize URL built in start_login().
_SCOPE = "agent_dashboard:access"
# Contract C11: emitted claim should equal 1; tolerant (warn) if missing.
# Compared against the token's ``oauth_contract_version`` claim.
_EXPECTED_CONTRACT_VERSION = 1
# Contract C7: JWKS Cache-Control max-age=300. Passed as ``lifespan`` to
# PyJWKClient when the JWKS client is first created.
_JWKS_CACHE_SECONDS = 300
# httpx timeout for the token endpoint POST (passed as ``timeout=`` in
# complete_login()).
_TOKEN_ENDPOINT_TIMEOUT_SEC = 10.0
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _b64url_no_pad(raw: bytes) -> str:
"""Base64url-encode without ``=`` padding (RFC 7636 §4)."""
return base64.urlsafe_b64encode(raw).rstrip(b"=").decode()
# ---------------------------------------------------------------------------
# Provider
# ---------------------------------------------------------------------------
class NousDashboardAuthProvider(DashboardAuthProvider):
    """Nous Portal OAuth via authorization-code + PKCE (S256).

    An instance is bound to one ``agent:{instance_id}`` client id and one
    Portal base URL. Access tokens are RS256 JWTs verified against the
    Portal's ``/.well-known/jwks.json``; refresh always fails under
    contract V1.
    """

    name = "nous"
    display_name = "Nous Research"

    def __init__(self, *, client_id: str, portal_url: str) -> None:
        """Bind the provider to a client id and a Portal deployment.

        Args:
            client_id: OAuth client id, contract shape ``agent:{instance_id}``.
            portal_url: Portal base URL; trailing slashes are stripped.

        Raises:
            ValueError: if ``client_id`` lacks the ``agent:`` prefix or
                carries an empty instance id.
        """
        prefix = "agent:"
        if not client_id.startswith(prefix):
            # Defense-in-depth. The plugin entry point already filters, but
            # the provider should never be constructible with a malformed id.
            raise ValueError(
                "client_id must match contract shape 'agent:{instance_id}', "
                f"got {client_id!r}"
            )
        instance_id = client_id[len(prefix):]
        if not instance_id:
            # A bare "agent:" would make the agent_instance_id cross-check
            # compare against "" and reject tokens at runtime; fail at
            # construction instead so the operator sees it at startup.
            raise ValueError(
                "client_id must match contract shape 'agent:{instance_id}' "
                f"with a non-empty instance_id, got {client_id!r}"
            )
        self._client_id = client_id
        self._agent_instance_id = instance_id
        self._portal_url = portal_url.rstrip("/")
        self._jwks_url = f"{self._portal_url}/.well-known/jwks.json"
        self._authorize_url = f"{self._portal_url}/oauth/authorize"
        self._token_url = f"{self._portal_url}/api/oauth/token"
        # PyJWKClient is lazily imported so plugin discovery doesn't pay the
        # crypto-import cost when the provider isn't activated.
        self._jwks_client: Any = None

    # ---- public API (DashboardAuthProvider) -------------------------------

    def start_login(self, *, redirect_uri: str) -> LoginStart:
        """Build the Portal authorize URL and the PKCE cookie payload.

        Args:
            redirect_uri: Callback URL the Portal should redirect back to.

        Returns:
            A ``LoginStart`` carrying the authorize redirect URL and a
            ``cookie_payload`` with the ``hermes_session_pkce`` entry.

        Raises:
            ProviderError: if ``redirect_uri`` fails the local sanity check.
        """
        self._validate_redirect_uri(redirect_uri)
        code_verifier = _b64url_no_pad(secrets.token_bytes(64))  # ~86 chars
        code_challenge = _b64url_no_pad(
            hashlib.sha256(code_verifier.encode("ascii")).digest()
        )
        state = _b64url_no_pad(secrets.token_bytes(32))
        params = {
            "response_type": "code",
            "client_id": self._client_id,
            "redirect_uri": redirect_uri,
            "scope": _SCOPE,
            "state": state,
            "code_challenge": code_challenge,
            "code_challenge_method": "S256",
        }
        redirect_url = f"{self._authorize_url}?{urllib.parse.urlencode(params)}"
        # The auth-route layer expects ``cookie_payload["hermes_session_pkce"]``
        # as a single semicolon-delimited string of ``key=value`` segments,
        # matching the stub provider's shape. The route handler prepends
        # ``provider=`` so the callback knows which plugin to dispatch to.
        cookie_payload = {
            "hermes_session_pkce": f"state={state};verifier={code_verifier}",
        }
        return LoginStart(redirect_url=redirect_url, cookie_payload=cookie_payload)

    def complete_login(
        self,
        *,
        code: str,
        state: str,
        code_verifier: str,
        redirect_uri: str,
    ) -> Session:
        """Exchange the authorization code for a verified session.

        Args:
            code: Authorization code from the callback query string.
            state: OAuth state; accepted for protocol symmetry, unused here.
            code_verifier: PKCE verifier stashed by ``start_login``.
            redirect_uri: Must match the value sent to the authorize endpoint.

        Returns:
            A ``Session`` built from the verified access-token claims.

        Raises:
            InvalidCodeError: the Portal answered HTTP 400, or the returned
                access token is already expired.
            ProviderError: the token endpoint is unreachable, answered with
                an unexpected status, returned a malformed payload, or the
                access token failed verification.
        """
        # ``state`` is verified by the auth-route layer before this call
        # (it checks the cookie-stashed state matches the query-param state);
        # we just receive it for symmetry with the protocol. Nous Portal
        # doesn't re-check state at the token endpoint, so we ignore it here.
        _ = state
        try:
            response = httpx.post(
                self._token_url,
                data={
                    "grant_type": "authorization_code",
                    "code": code,
                    "redirect_uri": redirect_uri,
                    "client_id": self._client_id,
                    "code_verifier": code_verifier,
                },
                headers={"Accept": "application/json"},
                timeout=_TOKEN_ENDPOINT_TIMEOUT_SEC,
            )
        except httpx.RequestError as exc:
            raise ProviderError(f"Portal token endpoint unreachable: {exc}") from exc
        if response.status_code == 400:
            # Contract: invalid_code, invalid_grant, redirect_uri_mismatch all
            # surface as 400 with an OAuth-shaped JSON error envelope.
            body = self._parse_json_body(response)
            error_code = body.get("error", "invalid_request")
            raise InvalidCodeError(f"Portal rejected code: {error_code}")
        if response.status_code != 200:
            raise ProviderError(
                f"Portal token endpoint returned {response.status_code}: "
                f"{response.text[:200]!r}"
            )
        payload = self._parse_json_body(response)
        access_token = payload.get("access_token")
        if not access_token or not isinstance(access_token, str):
            raise ProviderError("Portal token response missing access_token")
        # An absent token_type is tolerated; a present one must be "bearer".
        token_type = str(payload.get("token_type", "")).lower()
        if token_type and token_type != "bearer":
            raise ProviderError(f"unexpected token_type={token_type!r}")
        claims = self._verify_jwt(access_token)
        # Contract V1: no refresh token expected. If a future Portal ever
        # adds one, capture it forward-compatibly.
        refresh_token = payload.get("refresh_token") or ""
        if not isinstance(refresh_token, str):
            refresh_token = ""
        return self._session_from_claims(access_token, refresh_token, claims)

    def refresh_session(self, *, refresh_token: str) -> Session:
        """Always refuse: contract V1 issues no refresh tokens.

        Raises:
            RefreshExpiredError: unconditionally.
        """
        # Contract V1 has no refresh tokens — always force re-auth. If a
        # future Portal contract starts issuing them, this method needs to
        # be re-implemented; until then it's an unconditional refusal.
        raise RefreshExpiredError(
            "Nous Portal does not issue refresh tokens in OAuth contract v1; "
            "user must re-authenticate via /auth/login."
        )

    def verify_session(self, *, access_token: str) -> Optional[Session]:
        """Verify an access token and rebuild its ``Session``.

        Returns:
            The session, or ``None`` when ``_verify_jwt`` raises
            ``InvalidCodeError`` (which it does for expired tokens).

        Raises:
            ProviderError: propagated unchanged from ``_verify_jwt`` (JWKS
                lookup failure, failed verification, claim mismatch) or from
                ``_session_from_claims``.
        """
        # NOTE(review): only *expired* tokens map to InvalidCodeError in
        # _verify_jwt; any other invalid token (bad signature, wrong
        # aud/iss) surfaces as ProviderError and therefore propagates
        # rather than returning None. Confirm this matches the middleware
        # contract.
        try:
            claims = self._verify_jwt(access_token)
        except InvalidCodeError:
            # Expired token — middleware contract is None, not raise.
            return None
        # verify_session has no access to the original refresh_token; pass
        # "" because in contract V1 there is none anyway.
        return self._session_from_claims(access_token, "", claims)

    def revoke_session(self, *, refresh_token: str) -> None:
        """No-op: there is nothing to revoke under contract V1."""
        # Contract V1: no refresh tokens to revoke, and no Portal revocation
        # endpoint documented for dashboard tokens. Logout is purely
        # client-side cookie clearing; this is a best-effort no-op.
        _ = refresh_token
        return None

    # ---- internals --------------------------------------------------------

    def _validate_redirect_uri(self, redirect_uri: str) -> None:
        """Surface obviously-broken redirect_uris before bouncing to Portal.

        The Portal-side check (``agent-redirect-uri.ts``) is authoritative;
        this is a fast-fail for the common operator-error case.

        Raises:
            ProviderError: the URI cannot be parsed, is not http(s), uses
                plain http for a host other than localhost/127.0.0.1, or
                its path does not end with ``/auth/callback``.
        """
        try:
            parsed = urllib.parse.urlparse(redirect_uri)
            host = parsed.hostname
        except ValueError as exc:
            # urlparse raises ValueError for a malformed netloc (e.g. an
            # unterminated IPv6 bracket); report it like every other
            # broken-URI case instead of leaking a bare ValueError.
            raise ProviderError(
                f"redirect_uri is not a valid URL: {redirect_uri!r} ({exc})"
            ) from exc
        if parsed.scheme not in ("https", "http"):
            raise ProviderError(
                f"redirect_uri must be http(s), got {redirect_uri!r}"
            )
        if parsed.scheme == "http" and host not in (
            "localhost",
            "127.0.0.1",
        ):
            raise ProviderError(
                "redirect_uri may only use http:// for localhost/127.0.0.1, "
                f"got {redirect_uri!r}"
            )
        if not parsed.path or not parsed.path.endswith("/auth/callback"):
            raise ProviderError(
                "redirect_uri path must end with '/auth/callback', "
                f"got {redirect_uri!r}"
            )

    def _parse_json_body(self, response: httpx.Response) -> Dict[str, Any]:
        """Return the response's JSON object, or ``{}`` if it has none.

        ``{}`` is returned when the Content-Type is not JSON, the body is
        not valid JSON, or the decoded value is not a JSON object.
        """
        # Media types are case-insensitive, so normalize before comparing.
        ctype = response.headers.get("content-type", "").strip().lower()
        if not ctype.startswith("application/json"):
            return {}
        try:
            body = response.json()
        except ValueError:
            return {}
        return body if isinstance(body, dict) else {}

    def _get_jwks_client(self) -> Any:
        """Return the ``PyJWKClient``, creating it on first use."""
        if self._jwks_client is None:
            from jwt import PyJWKClient  # lazy import

            self._jwks_client = PyJWKClient(
                self._jwks_url,
                cache_keys=True,
                lifespan=_JWKS_CACHE_SECONDS,
            )
        return self._jwks_client

    def _verify_jwt(self, access_token: str) -> Dict[str, Any]:
        """Verify signature and claims of ``access_token``; return the claims.

        Raises:
            InvalidCodeError: the token is expired.
            ProviderError: JWKS lookup failed, any other verification step
                failed, or the ``agent_instance_id`` /
                ``oauth_contract_version`` cross-checks rejected the token.
        """
        # Lazy import — keeps startup fast for operators who never trigger
        # the gated path.
        import jwt

        try:
            signing_key = self._get_jwks_client().get_signing_key_from_jwt(
                access_token
            )
        except jwt.PyJWKClientError as exc:
            raise ProviderError(f"JWKS lookup failed: {exc}") from exc
        except Exception as exc:  # pragma: no cover - defensive
            raise ProviderError(f"JWKS lookup failed: {exc!r}") from exc
        try:
            claims = jwt.decode(
                access_token,
                signing_key.key,
                algorithms=["RS256"],
                # Contract C2: aud is the bare client_id.
                audience=self._client_id,
                # Contract: issuer is the Portal base URL.
                issuer=self._portal_url,
                options={"require": ["exp", "iat", "aud", "iss", "sub"]},
            )
        except jwt.ExpiredSignatureError as exc:
            # verify_session() catches this and returns None per protocol.
            raise InvalidCodeError(f"access token expired: {exc}") from exc
        except jwt.InvalidTokenError as exc:
            # Surface the actual claim values that failed verification so
            # operators don't have to dig into the JWT to debug config drift
            # between HERMES_DASHBOARD_PORTAL_URL / HERMES_DASHBOARD_OAUTH_CLIENT_ID
            # and what Portal is actually emitting. Decoding without verification
            # is safe here: we've already failed to verify, and we never trust
            # these values — they're surfaced for diagnostics only.
            details = ""
            try:
                unverified = jwt.decode(
                    access_token,
                    options={"verify_signature": False, "verify_exp": False},
                )
                details = (
                    f" [token iss={unverified.get('iss')!r} "
                    f"aud={unverified.get('aud')!r}; "
                    f"expected iss={self._portal_url!r} "
                    f"aud={self._client_id!r}]"
                )
            except Exception:
                # Best-effort diagnostics: never let this mask the real
                # verification failure, but leave a trace for debugging.
                logger.debug(
                    "could not decode unverified token claims for diagnostics",
                    exc_info=True,
                )
            raise ProviderError(
                f"access token verification failed: {exc}{details}"
            ) from exc
        self._check_agent_instance_id(claims)
        self._check_contract_version(claims)
        return claims

    def _check_agent_instance_id(self, claims: Dict[str, Any]) -> None:
        """Contract C9: cross-check agent_instance_id against our config.

        Raises:
            ProviderError: the claim is present and differs from the
                instance id derived from the configured client id.
        """
        token_instance_id = claims.get("agent_instance_id")
        if token_instance_id is None:
            # Tolerated — the claim is documented as "should" not "must".
            # Our audience check on the bare client_id already binds the
            # token to this instance; agent_instance_id is defense-in-depth.
            return
        if token_instance_id != self._agent_instance_id:
            raise ProviderError(
                f"agent_instance_id mismatch: token={token_instance_id!r} "
                f"vs configured={self._agent_instance_id!r}"
            )

    def _check_contract_version(self, claims: Dict[str, Any]) -> None:
        """Contract C11 — tolerant treatment per OQ-C2.

        A missing claim logs a warning and passes.

        Raises:
            ProviderError: the claim is present and is not the expected
                contract version.
        """
        contract_version = claims.get("oauth_contract_version")
        if contract_version is None:
            logger.warning(
                "Nous Portal token missing oauth_contract_version claim "
                "(contract says it should be %d); proceeding anyway.",
                _EXPECTED_CONTRACT_VERSION,
            )
            return
        if contract_version != _EXPECTED_CONTRACT_VERSION:
            raise ProviderError(
                f"unsupported oauth_contract_version={contract_version!r}, "
                f"expected {_EXPECTED_CONTRACT_VERSION}"
            )

    def _session_from_claims(
        self,
        access_token: str,
        refresh_token: str,
        claims: Dict[str, Any],
    ) -> Session:
        """Build a ``Session`` from verified token claims.

        Raises:
            ProviderError: the ``sub`` claim is missing or empty.
        """
        # Contract C4: no email / display_name in tokens. AuthWidget will
        # show user_id (truncated). Session fields kept for forward-compat.
        user_id = str(claims.get("sub", ""))
        if not user_id:
            raise ProviderError("token missing 'sub' (user_id) claim")
        return Session(
            user_id=user_id,
            email="",
            display_name="",
            org_id=str(claims.get("org_id") or ""),
            provider=self.name,
            expires_at=int(claims["exp"]),
            access_token=access_token,
            refresh_token=refresh_token,
        )
# ---------------------------------------------------------------------------
# Plugin entry point
# ---------------------------------------------------------------------------
def register(ctx) -> None:
    """Plugin entry point, invoked by the plugin loader at startup.

    The provider is registered on ``ctx`` only when
    ``HERMES_DASHBOARD_OAUTH_CLIENT_ID`` is set and well-formed.
    ``HERMES_DASHBOARD_PORTAL_URL`` is optional and falls back to the
    production Portal URL when unset or blank.

    Whenever registration is skipped, a human-readable explanation is stored
    in the module-level :data:`LAST_SKIP_REASON` so the dashboard's
    fail-closed branch can show it instead of a generic "no providers
    registered" message. The slot is reset to ``""`` at the start of every
    call.

    Operator-owned dashboards (loopback / ``--insecure``) leave the env var
    unset, which makes this plugin a no-op for them.
    """
    global LAST_SKIP_REASON
    LAST_SKIP_REASON = ""

    env = os.environ
    client_id = env.get("HERMES_DASHBOARD_OAUTH_CLIENT_ID", "").strip()
    configured_portal = env.get("HERMES_DASHBOARD_PORTAL_URL", "").strip()
    portal_url = configured_portal if configured_portal else _DEFAULT_PORTAL_URL

    # Guard 1: env var absent or blank — expected for operator-owned
    # dashboards, so this is only logged at debug level.
    if client_id == "":
        reason = (
            "HERMES_DASHBOARD_OAUTH_CLIENT_ID is not set. The Nous Portal "
            "provisions this env var (shape 'agent:{instance_id}') when it "
            "deploys a Hermes Agent instance — set it to your provisioned "
            "client id, or pass --insecure to skip the OAuth gate entirely."
        )
        LAST_SKIP_REASON = reason
        logger.debug("dashboard-auth-nous: %s", LAST_SKIP_REASON)
        return

    # Guard 2: env var present but not in the contract shape.
    has_contract_prefix = client_id.startswith("agent:")
    if not has_contract_prefix:
        reason = (
            f"HERMES_DASHBOARD_OAUTH_CLIENT_ID={client_id!r} doesn't match "
            "the contract shape 'agent:{instance_id}'. The Nous Portal "
            "provisions this value at deploy time; check your Fly app's "
            "secrets or override with the value from the Portal admin UI."
        )
        LAST_SKIP_REASON = reason
        logger.warning("dashboard-auth-nous: %s", LAST_SKIP_REASON)
        return

    try:
        provider = NousDashboardAuthProvider(
            client_id=client_id,
            portal_url=portal_url,
        )
    except ValueError as exc:
        # The constructor re-validates the client id; record its verdict.
        LAST_SKIP_REASON = f"NousDashboardAuthProvider construction failed: {exc}"
        logger.warning("dashboard-auth-nous: %s", LAST_SKIP_REASON)
        return
    else:
        ctx.register_dashboard_auth_provider(provider)
        logger.info(
            "dashboard-auth-nous: registered provider (client_id=%s, portal=%s)",
            client_id,
            portal_url,
        )