From 2c6e266e8829f9aaff1be4666afdbb05ca15fc6d Mon Sep 17 00:00:00 2001 From: Ben Barclay Date: Fri, 19 Jun 2026 11:01:24 +1000 Subject: [PATCH] fix(relay): trigger self-provision on relay-config + NAS token, not is_managed() (#48724) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit self_provision_if_managed() gated on is_managed(), but is_managed() means "NixOS/package-manager-managed" (it keys on HERMES_MANAGED or a ~/.hermes/.managed marker) — NOT "NAS-hosted". A NAS-provisioned Fly agent sets NEITHER, so the gate was always False and relay self-provision SILENTLY no-oped on exactly the hosted agents it was built for. Caught live: a staging agent with GATEWAY_RELAY_URL correctly stamped logged "No messaging platforms enabled" and never dialed the connector; HERMES_MANAGED was unset on the machine. The unit tests had mocked is_managed()->True, so they passed while the real trigger never fired (mocked-trigger blind spot). Fix: drop the is_managed() gate and rename self_provision_if_managed -> self_provision_relay. The real trigger is now "relay_url() set + no pinned secret + a resolvable NAS token", which is both NAS-independent and self-guarding: - NAS-hosted agent: GATEWAY_RELAY_URL + no pinned secret + bootstrapped NAS token -> self-provisions. - Self-hosted + `hermes gateway enroll`: pinned GATEWAY_RELAY_SECRET -> skipped (existing secret-present guard). - Self-hosted, unenrolled, no NAS identity: resolve_nous_access_token() fails -> graceful no-op (existing fail-soft path). Security: unchanged trust model. The connector still derives tenant from the validated NAS token; this only broadens WHEN the provision attempt fires, and every broadened case is still guarded by token-resolution + pinned-secret-skip. 
Tests: replaced the (wrong) "skips when not managed" test with a regression test proving a NAS host where is_managed()==False STILL provisions; renamed all call sites; added a "no NAS token -> non-fatal skip" test for the self-hosted branch. 88 relay tests pass. Relay-adapter lane. EXPERIMENTAL. --- gateway/relay/__init__.py | 42 +++++++++------- gateway/run.py | 13 ++--- tests/gateway/relay/test_self_provision.py | 56 +++++++++++++++------- 3 files changed, 70 insertions(+), 41 deletions(-) diff --git a/gateway/relay/__init__.py b/gateway/relay/__init__.py index a0bd4f526ef..4b3fdda8a8d 100644 --- a/gateway/relay/__init__.py +++ b/gateway/relay/__init__.py @@ -204,21 +204,33 @@ def _post_provision( return payload -def self_provision_if_managed() -> bool: - """Managed-boot self-provision: mint relay creds in-process, no human, no disk. +def self_provision_relay() -> bool: + """Boot-time relay self-provision: mint relay creds in-process, no human, no disk. - Fires only on a MANAGED boot (``is_managed()``) with relay configured - (``relay_url()`` set) and NO per-gateway secret already present. In that case - the runtime resolves the agent's own Nous access token (the same + Fires when relay is configured (``relay_url()`` set) and NO per-gateway secret + is already present, AND the agent can resolve its own Nous access token. In + that case the runtime resolves the agent's own Nous access token (the same ``resolve_nous_access_token()`` the enroll CLI / dashboard register use), POSTs ``/relay/provision`` asserting its own endpoint + route keys, and sets ``GATEWAY_RELAY_ID`` / ``GATEWAY_RELAY_SECRET`` / ``GATEWAY_RELAY_DELIVERY_KEY`` into ``os.environ`` so the subsequent ``register_relay_adapter()`` picks them - up. The creds live ONLY in process memory — never written to ``~/.hermes/.env`` - (``save_env_value`` refuses under managed anyway, and keeping the secret off - any volume is the stronger posture). + up. 
The creds live ONLY in process memory — never written to ``~/.hermes/.env``. - Stateless: process-env creds don't survive a restart, so a managed container + The trigger is deliberately NOT ``is_managed()``: that means + "package-manager/NixOS-managed" and is False on a NAS-hosted Fly agent (which + sets neither ``HERMES_MANAGED`` nor a ``.managed`` marker), so gating on it + blocked the exact hosted case this is for. The real signal is "you pointed me + at a connector and didn't pin a secret" — which is both NAS-independent and + self-guarding: + + - A NAS-hosted agent: has ``GATEWAY_RELAY_URL``, no pinned secret, and a + bootstrapped NAS token -> self-provisions. + - A self-hosted operator who ran ``hermes gateway enroll``: has a PINNED + ``GATEWAY_RELAY_SECRET`` -> skipped (the secret-present guard below). + - A self-hosted box with a relay URL but no NAS identity: + ``resolve_nous_access_token()`` fails -> graceful no-op. + + Stateless: process-env creds don't survive a restart, so a hosted container re-provisions every boot; the connector's rotation window covers a still- connected prior instance. An explicitly-pinned ``GATEWAY_RELAY_SECRET`` (env or config) is RESPECTED — self-provision skips so an operator pin isn't @@ -233,18 +245,12 @@ def self_provision_if_managed() -> bool: logger = logging.getLogger("gateway.relay") - try: - from hermes_cli.config import is_managed - except Exception: # noqa: BLE001 - return False - - if not is_managed(): - return False dial_url = relay_url() if not dial_url: return False - # Respect an already-present (pinned/stamped) secret — don't stomp it. + # Respect an already-present (pinned/stamped) secret — don't stomp it. This + # is also what makes a self-hosted, enrolled gateway skip self-provision. 
existing_id, existing_secret = relay_connection_auth() if existing_id and existing_secret: logger.info("relay self-provision skipped: GATEWAY_RELAY_SECRET already set") @@ -255,6 +261,8 @@ def self_provision_if_managed() -> bool: access_token = resolve_nous_access_token() except Exception as exc: # noqa: BLE001 - boot must survive a token failure + # No resolvable NAS identity (e.g. a self-hosted box that hasn't enrolled) + # -> nothing to provision with; skip quietly and let the gateway boot. logger.warning("relay self-provision skipped: could not resolve Nous token (%s)", exc) return False diff --git a/gateway/run.py b/gateway/run.py index 8f139341793..e24afd035e7 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -5119,14 +5119,15 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew from gateway.relay import ( register_relay_adapter, relay_url, - self_provision_if_managed, + self_provision_relay, ) - # Managed boot: self-provision relay creds in-process (resolve the - # agent's NAS token -> POST /relay/provision -> set GATEWAY_RELAY_* in - # os.environ) BEFORE registration reads them. No-op when not managed, - # relay unconfigured, or a secret is already pinned. Never raises. - self_provision_if_managed() + # Boot-time relay self-provision: resolve the agent's NAS token -> + # POST /relay/provision -> set GATEWAY_RELAY_* in os.environ BEFORE + # registration reads them. No-op when relay is unconfigured, a secret + # is already pinned, or no NAS token resolves (self-hosted, unenrolled). + # Never raises. + self_provision_relay() if register_relay_adapter(): logger.info("relay adapter registered (connector at %s)", relay_url()) diff --git a/tests/gateway/relay/test_self_provision.py b/tests/gateway/relay/test_self_provision.py index 7a379eb5c3b..c5af66f94ef 100644 --- a/tests/gateway/relay/test_self_provision.py +++ b/tests/gateway/relay/test_self_provision.py @@ -1,9 +1,13 @@ -"""Unit tests for managed-boot relay self-provisioning. 
+"""Unit tests for boot-time relay self-provisioning. -Covers gateway.relay.self_provision_if_managed() + the relay_endpoint() / +Covers gateway.relay.self_provision_relay() + the relay_endpoint() / relay_route_keys() config readers. The connector HTTP POST is monkeypatched (the cross-repo E2E exercises the real /relay/provision); these prove the TRIGGER logic, in-process env wiring, and fail-soft boot behaviour. + +The trigger is deliberately NOT is_managed() (that means NixOS/package-manager- +managed, which is False on a NAS-hosted Fly agent). The real gate is +"relay_url set + no pinned secret + a resolvable NAS token". """ from __future__ import annotations @@ -48,8 +52,13 @@ def _stub_post(captured: dict): return _fake -def _arm(monkeypatch, *, managed=True, url="wss://connector.example/relay", token="nas-token"): - monkeypatch.setattr("hermes_cli.config.is_managed", lambda: managed) +def _arm(monkeypatch, *, url="wss://connector.example/relay", token="nas-token"): + """Arm the real trigger: a relay URL + a resolvable NAS token. + + Note there is intentionally no `managed` knob — self-provision no longer + consults is_managed(). A test that wants the "no NAS identity" branch + monkeypatches resolve_nous_access_token to raise instead. 
+ """ monkeypatch.setattr(relay, "relay_url", lambda: url) monkeypatch.setattr("hermes_cli.auth.resolve_nous_access_token", lambda: token) @@ -82,29 +91,37 @@ def test_provision_url_maps_ws_to_http(): # ─────────────────────────── trigger logic ─────────────────────────── -def test_skips_when_not_managed(monkeypatch): - _arm(monkeypatch, managed=False) - called = {"n": 0} - monkeypatch.setattr(relay, "_post_provision", lambda **k: called.__setitem__("n", called["n"] + 1) or {}) - assert relay.self_provision_if_managed() is False - assert called["n"] == 0 +def test_provisions_on_nas_host_that_is_NOT_is_managed(monkeypatch): + """Regression: a NAS-hosted Fly agent sets neither HERMES_MANAGED nor a + .managed marker, so is_managed() is False. Self-provision must STILL fire — + the old is_managed() gate silently no-oped exactly this case in staging. + """ + # Force is_managed() False to model a real hosted agent; it must be irrelevant. + monkeypatch.setattr("hermes_cli.config.is_managed", lambda: False) + _arm(monkeypatch) + captured: dict = {} + monkeypatch.setattr(relay, "_post_provision", _stub_post(captured)) + + assert relay.self_provision_relay() is True + assert relay.relay_connection_auth()[1] == "a" * 64 def test_skips_when_relay_not_configured(monkeypatch): _arm(monkeypatch, url=None) called = {"n": 0} monkeypatch.setattr(relay, "_post_provision", lambda **k: called.__setitem__("n", called["n"] + 1) or {}) - assert relay.self_provision_if_managed() is False + assert relay.self_provision_relay() is False assert called["n"] == 0 def test_skips_when_secret_already_pinned(monkeypatch): + """A self-hosted, enrolled gateway has a pinned secret -> never self-provisions.""" _arm(monkeypatch) monkeypatch.setenv("GATEWAY_RELAY_ID", "gw-pinned") monkeypatch.setenv("GATEWAY_RELAY_SECRET", "deadbeef") called = {"n": 0} monkeypatch.setattr(relay, "_post_provision", lambda **k: called.__setitem__("n", called["n"] + 1) or {}) - assert relay.self_provision_if_managed() is 
False + assert relay.self_provision_relay() is False assert called["n"] == 0 # The pinned secret is untouched. assert relay.relay_connection_auth() == ("gw-pinned", "deadbeef") @@ -119,7 +136,7 @@ def test_provisions_and_sets_env_in_process(monkeypatch): captured: dict = {} monkeypatch.setattr(relay, "_post_provision", _stub_post(captured)) - assert relay.self_provision_if_managed() is True + assert relay.self_provision_relay() is True # The connector POST carried the gateway-asserted endpoint + route keys. assert captured["provision_url"] == "https://connector.example/relay/provision" assert captured["access_token"] == "nas-token" @@ -138,7 +155,7 @@ def test_outbound_only_when_no_endpoint(monkeypatch): captured: dict = {} monkeypatch.setattr(relay, "_post_provision", _stub_post(captured)) - assert relay.self_provision_if_managed() is True + assert relay.self_provision_relay() is True assert captured["gateway_endpoint"] is None assert captured["route_keys"] == [] assert relay.relay_connection_auth()[1] == "a" * 64 @@ -146,15 +163,18 @@ def test_outbound_only_when_no_endpoint(monkeypatch): # ─────────────────────────── fail-soft ─────────────────────────── -def test_token_failure_is_non_fatal(monkeypatch): - _arm(monkeypatch) +def test_no_nas_token_is_non_fatal(monkeypatch): + """A self-hosted box with a relay URL but no resolvable NAS identity skips + quietly (this is the branch that replaces the old is_managed() gate for the + non-NAS case).""" + monkeypatch.setattr(relay, "relay_url", lambda: "wss://connector.example/relay") def _boom(): raise RuntimeError("no token") monkeypatch.setattr("hermes_cli.auth.resolve_nous_access_token", _boom) # Must not raise; returns False; no creds set. 
- assert relay.self_provision_if_managed() is False + assert relay.self_provision_relay() is False assert relay.relay_connection_auth() == (None, None) @@ -165,5 +185,5 @@ def test_connector_failure_is_non_fatal(monkeypatch): raise RuntimeError("connector returned HTTP 503") monkeypatch.setattr(relay, "_post_provision", _boom) - assert relay.self_provision_if_managed() is False + assert relay.self_provision_relay() is False assert relay.relay_connection_auth() == (None, None)