mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-20 10:11:58 +00:00
fix(relay): trigger self-provision on relay-config + NAS token, not is_managed() (#48724)
self_provision_if_managed() gated on is_managed(), but is_managed() means
"NixOS/package-manager-managed" (it keys on HERMES_MANAGED or a ~/.hermes/.managed
marker) — NOT "NAS-hosted". A NAS-provisioned Fly agent sets NEITHER, so the gate
was always False and relay self-provision SILENTLY no-oped on exactly the hosted
agents it was built for. Caught live: a staging agent with GATEWAY_RELAY_URL
correctly stamped logged "No messaging platforms enabled" and never dialed the
connector; HERMES_MANAGED was unset on the machine. The unit tests had mocked
is_managed()->True, so they passed while the real trigger never fired (a
mocked-trigger blind spot).
Fix: drop the is_managed() gate and rename self_provision_if_managed ->
self_provision_relay. The real trigger is now "relay_url() set + no pinned secret
+ a resolvable NAS token", which is both independent of how the agent is hosted
(NAS-hosted or self-hosted) and self-guarding:
- NAS-hosted agent: GATEWAY_RELAY_URL + no pinned secret + bootstrapped NAS
token -> self-provisions.
- Self-hosted + `hermes gateway enroll`: pinned GATEWAY_RELAY_SECRET -> skipped
(existing secret-present guard).
- Self-hosted, unenrolled, no NAS identity: resolve_nous_access_token() fails
-> graceful no-op (existing fail-soft path).
Security: unchanged trust model. The connector still derives tenant from the
validated NAS token; this only broadens WHEN the provision attempt fires, and
every broadened case is still guarded by token-resolution + pinned-secret-skip.
Tests: replaced the (wrong) "skips when not managed" test with a regression test
proving a NAS host where is_managed()==False STILL provisions; renamed all call
sites; added a "no NAS token -> non-fatal skip" test for the self-hosted branch.
88 relay tests pass.
Relay-adapter lane. EXPERIMENTAL.
This commit is contained in:
parent
36851fa576
commit
2c6e266e88
3 changed files with 70 additions and 41 deletions
|
|
@ -204,21 +204,33 @@ def _post_provision(
|
|||
return payload
|
||||
|
||||
|
||||
def self_provision_if_managed() -> bool:
|
||||
"""Managed-boot self-provision: mint relay creds in-process, no human, no disk.
|
||||
def self_provision_relay() -> bool:
|
||||
"""Boot-time relay self-provision: mint relay creds in-process, no human, no disk.
|
||||
|
||||
Fires only on a MANAGED boot (``is_managed()``) with relay configured
|
||||
(``relay_url()`` set) and NO per-gateway secret already present. In that case
|
||||
the runtime resolves the agent's own Nous access token (the same
|
||||
Fires when relay is configured (``relay_url()`` set) and NO per-gateway secret
|
||||
is already present, AND the agent can resolve its own Nous access token. In
|
||||
that case the runtime resolves the agent's own Nous access token (the same
|
||||
``resolve_nous_access_token()`` the enroll CLI / dashboard register use),
|
||||
POSTs ``/relay/provision`` asserting its own endpoint + route keys, and sets
|
||||
``GATEWAY_RELAY_ID`` / ``GATEWAY_RELAY_SECRET`` / ``GATEWAY_RELAY_DELIVERY_KEY``
|
||||
into ``os.environ`` so the subsequent ``register_relay_adapter()`` picks them
|
||||
up. The creds live ONLY in process memory — never written to ``~/.hermes/.env``
|
||||
(``save_env_value`` refuses under managed anyway, and keeping the secret off
|
||||
any volume is the stronger posture).
|
||||
up. The creds live ONLY in process memory — never written to ``~/.hermes/.env``.
|
||||
|
||||
Stateless: process-env creds don't survive a restart, so a managed container
|
||||
The trigger is deliberately NOT ``is_managed()``: that means
|
||||
"package-manager/NixOS-managed" and is False on a NAS-hosted Fly agent (which
|
||||
sets neither ``HERMES_MANAGED`` nor a ``.managed`` marker), so gating on it
|
||||
blocked the exact hosted case this is for. The real signal is "you pointed me
|
||||
at a connector and didn't pin a secret" — which is both NAS-independent and
|
||||
self-guarding:
|
||||
|
||||
- A NAS-hosted agent: has ``GATEWAY_RELAY_URL``, no pinned secret, and a
|
||||
bootstrapped NAS token -> self-provisions.
|
||||
- A self-hosted operator who ran ``hermes gateway enroll``: has a PINNED
|
||||
``GATEWAY_RELAY_SECRET`` -> skipped (the secret-present guard below).
|
||||
- A self-hosted box with a relay URL but no NAS identity:
|
||||
``resolve_nous_access_token()`` fails -> graceful no-op.
|
||||
|
||||
Stateless: process-env creds don't survive a restart, so a hosted container
|
||||
re-provisions every boot; the connector's rotation window covers a still-
|
||||
connected prior instance. An explicitly-pinned ``GATEWAY_RELAY_SECRET`` (env
|
||||
or config) is RESPECTED — self-provision skips so an operator pin isn't
|
||||
|
|
@ -233,18 +245,12 @@ def self_provision_if_managed() -> bool:
|
|||
|
||||
logger = logging.getLogger("gateway.relay")
|
||||
|
||||
try:
|
||||
from hermes_cli.config import is_managed
|
||||
except Exception: # noqa: BLE001
|
||||
return False
|
||||
|
||||
if not is_managed():
|
||||
return False
|
||||
dial_url = relay_url()
|
||||
if not dial_url:
|
||||
return False
|
||||
|
||||
# Respect an already-present (pinned/stamped) secret — don't stomp it.
|
||||
# Respect an already-present (pinned/stamped) secret — don't stomp it. This
|
||||
# is also what makes a self-hosted, enrolled gateway skip self-provision.
|
||||
existing_id, existing_secret = relay_connection_auth()
|
||||
if existing_id and existing_secret:
|
||||
logger.info("relay self-provision skipped: GATEWAY_RELAY_SECRET already set")
|
||||
|
|
@ -255,6 +261,8 @@ def self_provision_if_managed() -> bool:
|
|||
|
||||
access_token = resolve_nous_access_token()
|
||||
except Exception as exc: # noqa: BLE001 - boot must survive a token failure
|
||||
# No resolvable NAS identity (e.g. a self-hosted box that hasn't enrolled)
|
||||
# -> nothing to provision with; skip quietly and let the gateway boot.
|
||||
logger.warning("relay self-provision skipped: could not resolve Nous token (%s)", exc)
|
||||
return False
|
||||
|
||||
|
|
|
|||
|
|
@ -5119,14 +5119,15 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
from gateway.relay import (
|
||||
register_relay_adapter,
|
||||
relay_url,
|
||||
self_provision_if_managed,
|
||||
self_provision_relay,
|
||||
)
|
||||
|
||||
# Managed boot: self-provision relay creds in-process (resolve the
|
||||
# agent's NAS token -> POST /relay/provision -> set GATEWAY_RELAY_* in
|
||||
# os.environ) BEFORE registration reads them. No-op when not managed,
|
||||
# relay unconfigured, or a secret is already pinned. Never raises.
|
||||
self_provision_if_managed()
|
||||
# Boot-time relay self-provision: resolve the agent's NAS token ->
|
||||
# POST /relay/provision -> set GATEWAY_RELAY_* in os.environ BEFORE
|
||||
# registration reads them. No-op when relay is unconfigured, a secret
|
||||
# is already pinned, or no NAS token resolves (self-hosted, unenrolled).
|
||||
# Never raises.
|
||||
self_provision_relay()
|
||||
|
||||
if register_relay_adapter():
|
||||
logger.info("relay adapter registered (connector at %s)", relay_url())
|
||||
|
|
|
|||
|
|
@ -1,9 +1,13 @@
|
|||
"""Unit tests for managed-boot relay self-provisioning.
|
||||
"""Unit tests for boot-time relay self-provisioning.
|
||||
|
||||
Covers gateway.relay.self_provision_if_managed() + the relay_endpoint() /
|
||||
Covers gateway.relay.self_provision_relay() + the relay_endpoint() /
|
||||
relay_route_keys() config readers. The connector HTTP POST is monkeypatched
|
||||
(the cross-repo E2E exercises the real /relay/provision); these prove the
|
||||
TRIGGER logic, in-process env wiring, and fail-soft boot behaviour.
|
||||
|
||||
The trigger is deliberately NOT is_managed() (that means NixOS/package-manager-
|
||||
managed, which is False on a NAS-hosted Fly agent). The real gate is
|
||||
"relay_url set + no pinned secret + a resolvable NAS token".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -48,8 +52,13 @@ def _stub_post(captured: dict):
|
|||
return _fake
|
||||
|
||||
|
||||
def _arm(monkeypatch, *, managed=True, url="wss://connector.example/relay", token="nas-token"):
|
||||
monkeypatch.setattr("hermes_cli.config.is_managed", lambda: managed)
|
||||
def _arm(monkeypatch, *, url="wss://connector.example/relay", token="nas-token"):
|
||||
"""Arm the real trigger: a relay URL + a resolvable NAS token.
|
||||
|
||||
Note there is intentionally no `managed` knob — self-provision no longer
|
||||
consults is_managed(). A test that wants the "no NAS identity" branch
|
||||
monkeypatches resolve_nous_access_token to raise instead.
|
||||
"""
|
||||
monkeypatch.setattr(relay, "relay_url", lambda: url)
|
||||
monkeypatch.setattr("hermes_cli.auth.resolve_nous_access_token", lambda: token)
|
||||
|
||||
|
|
@ -82,29 +91,37 @@ def test_provision_url_maps_ws_to_http():
|
|||
|
||||
# ─────────────────────────── trigger logic ───────────────────────────
|
||||
|
||||
def test_skips_when_not_managed(monkeypatch):
|
||||
_arm(monkeypatch, managed=False)
|
||||
called = {"n": 0}
|
||||
monkeypatch.setattr(relay, "_post_provision", lambda **k: called.__setitem__("n", called["n"] + 1) or {})
|
||||
assert relay.self_provision_if_managed() is False
|
||||
assert called["n"] == 0
|
||||
def test_provisions_on_nas_host_that_is_NOT_is_managed(monkeypatch):
|
||||
"""Regression: a NAS-hosted Fly agent sets neither HERMES_MANAGED nor a
|
||||
.managed marker, so is_managed() is False. Self-provision must STILL fire —
|
||||
the old is_managed() gate silently no-oped exactly this case in staging.
|
||||
"""
|
||||
# Force is_managed() False to model a real hosted agent; it must be irrelevant.
|
||||
monkeypatch.setattr("hermes_cli.config.is_managed", lambda: False)
|
||||
_arm(monkeypatch)
|
||||
captured: dict = {}
|
||||
monkeypatch.setattr(relay, "_post_provision", _stub_post(captured))
|
||||
|
||||
assert relay.self_provision_relay() is True
|
||||
assert relay.relay_connection_auth()[1] == "a" * 64
|
||||
|
||||
|
||||
def test_skips_when_relay_not_configured(monkeypatch):
|
||||
_arm(monkeypatch, url=None)
|
||||
called = {"n": 0}
|
||||
monkeypatch.setattr(relay, "_post_provision", lambda **k: called.__setitem__("n", called["n"] + 1) or {})
|
||||
assert relay.self_provision_if_managed() is False
|
||||
assert relay.self_provision_relay() is False
|
||||
assert called["n"] == 0
|
||||
|
||||
|
||||
def test_skips_when_secret_already_pinned(monkeypatch):
|
||||
"""A self-hosted, enrolled gateway has a pinned secret -> never self-provisions."""
|
||||
_arm(monkeypatch)
|
||||
monkeypatch.setenv("GATEWAY_RELAY_ID", "gw-pinned")
|
||||
monkeypatch.setenv("GATEWAY_RELAY_SECRET", "deadbeef")
|
||||
called = {"n": 0}
|
||||
monkeypatch.setattr(relay, "_post_provision", lambda **k: called.__setitem__("n", called["n"] + 1) or {})
|
||||
assert relay.self_provision_if_managed() is False
|
||||
assert relay.self_provision_relay() is False
|
||||
assert called["n"] == 0
|
||||
# The pinned secret is untouched.
|
||||
assert relay.relay_connection_auth() == ("gw-pinned", "deadbeef")
|
||||
|
|
@ -119,7 +136,7 @@ def test_provisions_and_sets_env_in_process(monkeypatch):
|
|||
captured: dict = {}
|
||||
monkeypatch.setattr(relay, "_post_provision", _stub_post(captured))
|
||||
|
||||
assert relay.self_provision_if_managed() is True
|
||||
assert relay.self_provision_relay() is True
|
||||
# The connector POST carried the gateway-asserted endpoint + route keys.
|
||||
assert captured["provision_url"] == "https://connector.example/relay/provision"
|
||||
assert captured["access_token"] == "nas-token"
|
||||
|
|
@ -138,7 +155,7 @@ def test_outbound_only_when_no_endpoint(monkeypatch):
|
|||
captured: dict = {}
|
||||
monkeypatch.setattr(relay, "_post_provision", _stub_post(captured))
|
||||
|
||||
assert relay.self_provision_if_managed() is True
|
||||
assert relay.self_provision_relay() is True
|
||||
assert captured["gateway_endpoint"] is None
|
||||
assert captured["route_keys"] == []
|
||||
assert relay.relay_connection_auth()[1] == "a" * 64
|
||||
|
|
@ -146,15 +163,18 @@ def test_outbound_only_when_no_endpoint(monkeypatch):
|
|||
|
||||
# ─────────────────────────── fail-soft ───────────────────────────
|
||||
|
||||
def test_token_failure_is_non_fatal(monkeypatch):
|
||||
_arm(monkeypatch)
|
||||
def test_no_nas_token_is_non_fatal(monkeypatch):
|
||||
"""A self-hosted box with a relay URL but no resolvable NAS identity skips
|
||||
quietly (this is the branch that replaces the old is_managed() gate for the
|
||||
non-NAS case)."""
|
||||
monkeypatch.setattr(relay, "relay_url", lambda: "wss://connector.example/relay")
|
||||
|
||||
def _boom():
|
||||
raise RuntimeError("no token")
|
||||
|
||||
monkeypatch.setattr("hermes_cli.auth.resolve_nous_access_token", _boom)
|
||||
# Must not raise; returns False; no creds set.
|
||||
assert relay.self_provision_if_managed() is False
|
||||
assert relay.self_provision_relay() is False
|
||||
assert relay.relay_connection_auth() == (None, None)
|
||||
|
||||
|
||||
|
|
@ -165,5 +185,5 @@ def test_connector_failure_is_non_fatal(monkeypatch):
|
|||
raise RuntimeError("connector returned HTTP 503")
|
||||
|
||||
monkeypatch.setattr(relay, "_post_provision", _boom)
|
||||
assert relay.self_provision_if_managed() is False
|
||||
assert relay.self_provision_relay() is False
|
||||
assert relay.relay_connection_auth() == (None, None)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue