fix(relay): trigger self-provision on relay-config + NAS token, not is_managed() (#48724)

self_provision_if_managed() gated on is_managed(), but is_managed() means
"NixOS/package-manager-managed" (it keys on HERMES_MANAGED or a ~/.hermes/.managed
marker) — NOT "NAS-hosted". A NAS-provisioned Fly agent sets NEITHER, so the gate
was always False and relay self-provision SILENTLY no-oped on exactly the hosted
agents it was built for. Caught live: a staging agent with GATEWAY_RELAY_URL
correctly stamped logged "No messaging platforms enabled" and never dialed the
connector; HERMES_MANAGED was unset on the machine. The unit tests had mocked
is_managed()->True, so they passed while the real trigger never fired (mocked-
trigger blind spot).

Fix: drop the is_managed() gate and rename self_provision_if_managed ->
self_provision_relay. The real trigger is now "relay_url() set + no pinned secret
+ a resolvable NAS token", which is both NAS-independent and self-guarding:
  - NAS-hosted agent: GATEWAY_RELAY_URL + no pinned secret + bootstrapped NAS
    token -> self-provisions.
  - Self-hosted + `hermes gateway enroll`: pinned GATEWAY_RELAY_SECRET -> skipped
    (existing secret-present guard).
  - Self-hosted, unenrolled, no NAS identity: resolve_nous_access_token() fails
    -> graceful no-op (existing fail-soft path).

Security: unchanged trust model. The connector still derives tenant from the
validated NAS token; this only broadens WHEN the provision attempt fires, and
every broadened case is still guarded by token-resolution + pinned-secret-skip.

Tests: replaced the (wrong) "skips when not managed" test with a regression test
proving a NAS host where is_managed()==False STILL provisions; renamed all call
sites; added a "no NAS token -> non-fatal skip" test for the self-hosted branch.
88 relay tests pass.

Relay-adapter lane. EXPERIMENTAL.
This commit is contained in:
Ben Barclay 2026-06-19 11:01:24 +10:00 committed by GitHub
parent 36851fa576
commit 2c6e266e88
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 70 additions and 41 deletions

View file

@ -204,21 +204,33 @@ def _post_provision(
return payload
def self_provision_if_managed() -> bool:
"""Managed-boot self-provision: mint relay creds in-process, no human, no disk.
def self_provision_relay() -> bool:
"""Boot-time relay self-provision: mint relay creds in-process, no human, no disk.
Fires only on a MANAGED boot (``is_managed()``) with relay configured
(``relay_url()`` set) and NO per-gateway secret already present. In that case
the runtime resolves the agent's own Nous access token (the same
Fires when relay is configured (``relay_url()`` set) and NO per-gateway secret
is already present, AND the agent can resolve its own Nous access token. In
that case the runtime resolves the agent's own Nous access token (the same
``resolve_nous_access_token()`` the enroll CLI / dashboard register use),
POSTs ``/relay/provision`` asserting its own endpoint + route keys, and sets
``GATEWAY_RELAY_ID`` / ``GATEWAY_RELAY_SECRET`` / ``GATEWAY_RELAY_DELIVERY_KEY``
into ``os.environ`` so the subsequent ``register_relay_adapter()`` picks them
up. The creds live ONLY in process memory — never written to ``~/.hermes/.env``
(``save_env_value`` refuses under managed anyway, and keeping the secret off
any volume is the stronger posture).
up. The creds live ONLY in process memory — never written to ``~/.hermes/.env``.
Stateless: process-env creds don't survive a restart, so a managed container
The trigger is deliberately NOT ``is_managed()``: that means
"package-manager/NixOS-managed" and is False on a NAS-hosted Fly agent (which
sets neither ``HERMES_MANAGED`` nor a ``.managed`` marker), so gating on it
blocked the exact hosted case this is for. The real signal is "you pointed me
at a connector and didn't pin a secret" — which is both NAS-independent and
self-guarding:
- A NAS-hosted agent: has ``GATEWAY_RELAY_URL``, no pinned secret, and a
bootstrapped NAS token -> self-provisions.
- A self-hosted operator who ran ``hermes gateway enroll``: has a PINNED
``GATEWAY_RELAY_SECRET`` -> skipped (the secret-present guard below).
- A self-hosted box with a relay URL but no NAS identity:
``resolve_nous_access_token()`` fails -> graceful no-op.
Stateless: process-env creds don't survive a restart, so a hosted container
re-provisions every boot; the connector's rotation window covers a still-
connected prior instance. An explicitly-pinned ``GATEWAY_RELAY_SECRET`` (env
or config) is RESPECTED — self-provision skips — so an operator pin isn't
@ -233,18 +245,12 @@ def self_provision_if_managed() -> bool:
logger = logging.getLogger("gateway.relay")
try:
from hermes_cli.config import is_managed
except Exception: # noqa: BLE001
return False
if not is_managed():
return False
dial_url = relay_url()
if not dial_url:
return False
# Respect an already-present (pinned/stamped) secret — don't stomp it.
# Respect an already-present (pinned/stamped) secret — don't stomp it. This
# is also what makes a self-hosted, enrolled gateway skip self-provision.
existing_id, existing_secret = relay_connection_auth()
if existing_id and existing_secret:
logger.info("relay self-provision skipped: GATEWAY_RELAY_SECRET already set")
@ -255,6 +261,8 @@ def self_provision_if_managed() -> bool:
access_token = resolve_nous_access_token()
except Exception as exc: # noqa: BLE001 - boot must survive a token failure
# No resolvable NAS identity (e.g. a self-hosted box that hasn't enrolled)
# -> nothing to provision with; skip quietly and let the gateway boot.
logger.warning("relay self-provision skipped: could not resolve Nous token (%s)", exc)
return False

View file

@ -5119,14 +5119,15 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
from gateway.relay import (
register_relay_adapter,
relay_url,
self_provision_if_managed,
self_provision_relay,
)
# Managed boot: self-provision relay creds in-process (resolve the
# agent's NAS token -> POST /relay/provision -> set GATEWAY_RELAY_* in
# os.environ) BEFORE registration reads them. No-op when not managed,
# relay unconfigured, or a secret is already pinned. Never raises.
self_provision_if_managed()
# Boot-time relay self-provision: resolve the agent's NAS token ->
# POST /relay/provision -> set GATEWAY_RELAY_* in os.environ BEFORE
# registration reads them. No-op when relay is unconfigured, a secret
# is already pinned, or no NAS token resolves (self-hosted, unenrolled).
# Never raises.
self_provision_relay()
if register_relay_adapter():
logger.info("relay adapter registered (connector at %s)", relay_url())

View file

@ -1,9 +1,13 @@
"""Unit tests for managed-boot relay self-provisioning.
"""Unit tests for boot-time relay self-provisioning.
Covers gateway.relay.self_provision_if_managed() + the relay_endpoint() /
Covers gateway.relay.self_provision_relay() + the relay_endpoint() /
relay_route_keys() config readers. The connector HTTP POST is monkeypatched
(the cross-repo E2E exercises the real /relay/provision); these prove the
TRIGGER logic, in-process env wiring, and fail-soft boot behaviour.
The trigger is deliberately NOT is_managed() (that means NixOS/package-manager-
managed, which is False on a NAS-hosted Fly agent). The real gate is
"relay_url set + no pinned secret + a resolvable NAS token".
"""
from __future__ import annotations
@ -48,8 +52,13 @@ def _stub_post(captured: dict):
return _fake
def _arm(monkeypatch, *, managed=True, url="wss://connector.example/relay", token="nas-token"):
monkeypatch.setattr("hermes_cli.config.is_managed", lambda: managed)
def _arm(monkeypatch, *, url="wss://connector.example/relay", token="nas-token"):
"""Arm the real trigger: a relay URL + a resolvable NAS token.
Note there is intentionally no `managed` knob — self-provision no longer
consults is_managed(). A test that wants the "no NAS identity" branch
monkeypatches resolve_nous_access_token to raise instead.
"""
monkeypatch.setattr(relay, "relay_url", lambda: url)
monkeypatch.setattr("hermes_cli.auth.resolve_nous_access_token", lambda: token)
@ -82,29 +91,37 @@ def test_provision_url_maps_ws_to_http():
# ─────────────────────────── trigger logic ───────────────────────────
def test_skips_when_not_managed(monkeypatch):
_arm(monkeypatch, managed=False)
called = {"n": 0}
monkeypatch.setattr(relay, "_post_provision", lambda **k: called.__setitem__("n", called["n"] + 1) or {})
assert relay.self_provision_if_managed() is False
assert called["n"] == 0
def test_provisions_on_nas_host_that_is_NOT_is_managed(monkeypatch):
"""Regression: a NAS-hosted Fly agent sets neither HERMES_MANAGED nor a
.managed marker, so is_managed() is False. Self-provision must STILL fire —
the old is_managed() gate silently no-oped exactly this case in staging.
"""
# Force is_managed() False to model a real hosted agent; it must be irrelevant.
monkeypatch.setattr("hermes_cli.config.is_managed", lambda: False)
_arm(monkeypatch)
captured: dict = {}
monkeypatch.setattr(relay, "_post_provision", _stub_post(captured))
assert relay.self_provision_relay() is True
assert relay.relay_connection_auth()[1] == "a" * 64
def test_skips_when_relay_not_configured(monkeypatch):
_arm(monkeypatch, url=None)
called = {"n": 0}
monkeypatch.setattr(relay, "_post_provision", lambda **k: called.__setitem__("n", called["n"] + 1) or {})
assert relay.self_provision_if_managed() is False
assert relay.self_provision_relay() is False
assert called["n"] == 0
def test_skips_when_secret_already_pinned(monkeypatch):
"""A self-hosted, enrolled gateway has a pinned secret -> never self-provisions."""
_arm(monkeypatch)
monkeypatch.setenv("GATEWAY_RELAY_ID", "gw-pinned")
monkeypatch.setenv("GATEWAY_RELAY_SECRET", "deadbeef")
called = {"n": 0}
monkeypatch.setattr(relay, "_post_provision", lambda **k: called.__setitem__("n", called["n"] + 1) or {})
assert relay.self_provision_if_managed() is False
assert relay.self_provision_relay() is False
assert called["n"] == 0
# The pinned secret is untouched.
assert relay.relay_connection_auth() == ("gw-pinned", "deadbeef")
@ -119,7 +136,7 @@ def test_provisions_and_sets_env_in_process(monkeypatch):
captured: dict = {}
monkeypatch.setattr(relay, "_post_provision", _stub_post(captured))
assert relay.self_provision_if_managed() is True
assert relay.self_provision_relay() is True
# The connector POST carried the gateway-asserted endpoint + route keys.
assert captured["provision_url"] == "https://connector.example/relay/provision"
assert captured["access_token"] == "nas-token"
@ -138,7 +155,7 @@ def test_outbound_only_when_no_endpoint(monkeypatch):
captured: dict = {}
monkeypatch.setattr(relay, "_post_provision", _stub_post(captured))
assert relay.self_provision_if_managed() is True
assert relay.self_provision_relay() is True
assert captured["gateway_endpoint"] is None
assert captured["route_keys"] == []
assert relay.relay_connection_auth()[1] == "a" * 64
@ -146,15 +163,18 @@ def test_outbound_only_when_no_endpoint(monkeypatch):
# ─────────────────────────── fail-soft ───────────────────────────
def test_token_failure_is_non_fatal(monkeypatch):
_arm(monkeypatch)
def test_no_nas_token_is_non_fatal(monkeypatch):
"""A self-hosted box with a relay URL but no resolvable NAS identity skips
quietly (this is the branch that replaces the old is_managed() gate for the
non-NAS case)."""
monkeypatch.setattr(relay, "relay_url", lambda: "wss://connector.example/relay")
def _boom():
raise RuntimeError("no token")
monkeypatch.setattr("hermes_cli.auth.resolve_nous_access_token", _boom)
# Must not raise; returns False; no creds set.
assert relay.self_provision_if_managed() is False
assert relay.self_provision_relay() is False
assert relay.relay_connection_auth() == (None, None)
@ -165,5 +185,5 @@ def test_connector_failure_is_non_fatal(monkeypatch):
raise RuntimeError("connector returned HTTP 503")
monkeypatch.setattr(relay, "_post_provision", _boom)
assert relay.self_provision_if_managed() is False
assert relay.self_provision_relay() is False
assert relay.relay_connection_auth() == (None, None)