feat(relay): forward a stable instance id at self-provision (Phase 6 Unit α) (#50772)

Add relay_instance_id() (env GATEWAY_RELAY_INSTANCE_ID first, then
gateway.relay_instance_id in config.yaml, mirroring the other relay readers) and
forward it in the /relay/provision body so the connector can bind
gatewayId -> instanceId and route inbound per-instance once Phase 6 delivery
lands.

The value is gateway-asserted but safely scoped: the org/tenant stays
NAS-token-verified at the connector, so a dishonest gateway can only bind its
OWN tenant's instance — same posture as relay_endpoint(). instanceId is only
added to the body when present, so omitting it lets the connector store null
(back-compat: self-hosted / pre-Phase-6 gateways simply have no binding yet).

For a managed (NAS-hosted) agent the id is NAS's AgentInstance.id, stamped into
the container env beside GATEWAY_RELAY_URL.

Tests: reader (env/config/absent), self_provision_relay forwards the id (set +
absent), and the real _post_provision body includes instanceId ONLY when set.

Refs: ~/nous/specs/gateway-gateway plan.md Phase 6 Unit α; decisions.md Q11.
This commit is contained in:
Ben Barclay 2026-06-22 21:46:59 +10:00 committed by GitHub
parent 065946d84f
commit 75a70d98f3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 130 additions and 1 deletions

View file

@ -131,6 +131,33 @@ def relay_route_keys() -> list[str]:
return [k.strip() for k in raw.split(",") if k.strip()]
def relay_instance_id() -> Optional[str]:
    """Return the stable per-instance id forwarded at provision (Phase 6 Unit α).

    The connector uses this id to bind ``gatewayId -> instanceId`` so inbound
    traffic can be routed per-instance (rather than tenant-broadcast) once
    Phase 6 delivery lands. For a managed agent the value is the NAS
    ``AgentInstance.id``, which NAS stamps into the container env as
    ``GATEWAY_RELAY_INSTANCE_ID`` next to ``GATEWAY_RELAY_URL``; a self-hosted
    operator may set it by hand.

    The id is gateway-asserted but safely scoped: the org/tenant stays
    token-verified, so a dishonest gateway can only bind ITS OWN tenant's
    instance — the same posture as ``relay_endpoint()``.

    Lookup order:
        1. ``GATEWAY_RELAY_INSTANCE_ID`` environment variable (Docker/NAS).
        2. ``gateway.relay_instance_id`` in config.yaml.

    Returns:
        The whitespace-stripped id, or ``None`` when neither source yields a
        non-empty value (or the config cannot be loaded). With ``None`` the
        connector stores null and per-instance routing simply has no binding
        for this connection yet (back-compat).
    """
    from_env = os.environ.get("GATEWAY_RELAY_INSTANCE_ID", "").strip()
    if from_env:
        return from_env

    # Env was empty/unset: fall back to the config file. Any failure here
    # (import, missing file, parse error, unexpected shape) is treated the
    # same as "no id configured".
    try:
        from gateway.run import _load_gateway_config  # late import to avoid cycle

        gateway_section = _load_gateway_config().get("gateway") or {}
        from_config = str(gateway_section.get("relay_instance_id", "") or "").strip()
    except Exception:  # noqa: BLE001 - config absence/parse must never crash boot
        return None
    return from_config or None
def _provision_url(relay_dial_url: str) -> str:
"""Map the ``ws(s)://…/relay`` dial URL to the ``http(s)://…/relay/provision`` POST URL."""
raw = relay_dial_url.rstrip("/")
@ -152,6 +179,7 @@ def _post_provision(
bot_id: str,
gateway_endpoint: Optional[str],
route_keys: list[str],
instance_id: Optional[str] = None,
timeout: float = 15.0,
) -> dict:
"""POST to the connector's ``/relay/provision`` and return the JSON body.
@ -173,6 +201,10 @@ def _post_provision(
"gatewayEndpoint": gateway_endpoint or "",
"routeKeys": route_keys,
}
# Only send instanceId when we actually have one — omitting it lets the
# connector store null (back-compat) rather than binding an empty string.
if instance_id:
body["instanceId"] = instance_id
data = json.dumps(body).encode("utf-8")
req = urllib.request.Request(
provision_url,
@ -277,6 +309,7 @@ def self_provision_relay() -> bool:
gateway_id = os.environ.get("GATEWAY_RELAY_ID", "").strip() or f"gw-{host or 'hermes'}"
endpoint = relay_endpoint()
route_keys = relay_route_keys()
instance_id = relay_instance_id()
try:
result = _post_provision(
@ -287,6 +320,7 @@ def self_provision_relay() -> bool:
bot_id=bot_id,
gateway_endpoint=endpoint,
route_keys=route_keys,
instance_id=instance_id,
)
except RuntimeError as exc:
logger.warning("relay self-provision failed (%s); gateway will boot without relay auth", exc)
@ -302,11 +336,12 @@ def self_provision_relay() -> bool:
os.environ["GATEWAY_RELAY_DELIVERY_KEY"] = str(result.get("deliveryKey") or "")
tenant = str(result.get("tenant") or "")
logger.info(
"relay self-provisioned (gateway_id=%s tenant=%s routes=%d inbound=%s)",
"relay self-provisioned (gateway_id=%s tenant=%s routes=%d inbound=%s instance=%s)",
os.environ["GATEWAY_RELAY_ID"],
tenant or "?",
len(route_keys),
"yes" if endpoint else "outbound-only",
instance_id or "unbound",
)
return True

View file

@ -30,6 +30,7 @@ def _clean_env(monkeypatch):
"GATEWAY_RELAY_ROUTE_KEYS",
"GATEWAY_RELAY_PLATFORM",
"GATEWAY_RELAY_BOT_ID",
"GATEWAY_RELAY_INSTANCE_ID",
):
monkeypatch.delenv(k, raising=False)
# Never read config.yaml off disk in these tests.
@ -83,6 +84,24 @@ def test_relay_route_keys_empty():
assert relay.relay_route_keys() == []
def test_relay_instance_id_from_env(monkeypatch):
    """An id supplied via GATEWAY_RELAY_INSTANCE_ID comes back whitespace-stripped."""
    monkeypatch.setenv("GATEWAY_RELAY_INSTANCE_ID", " inst-abc ")
    resolved = relay.relay_instance_id()
    assert resolved == "inst-abc"
def test_relay_instance_id_absent_is_none():
    """With no instance id available, the reader yields None (not an empty string)."""
    resolved = relay.relay_instance_id()
    assert resolved is None
def test_relay_instance_id_from_config(monkeypatch):
    """Without the env var, the id is read from ``gateway.relay_instance_id`` in config."""
    fake_config = {"gateway": {"relay_instance_id": "inst-from-config"}}
    # Patch the loader that relay_instance_id() imports lazily from gateway.run.
    monkeypatch.setattr(
        "gateway.run._load_gateway_config",
        lambda: fake_config,
        raising=False,
    )
    resolved = relay.relay_instance_id()
    assert resolved == "inst-from-config"
def test_provision_url_maps_ws_to_http():
assert relay._provision_url("wss://c.example/relay") == "https://c.example/relay/provision"
assert relay._provision_url("ws://c.example/relay") == "http://c.example/relay/provision"
@ -161,6 +180,81 @@ def test_outbound_only_when_no_endpoint(monkeypatch):
assert relay.relay_connection_auth()[1] == "a" * 64
# ─────────────────── instance-id forwarding (Phase 6 Unit α) ───────────────────
def test_forwards_instance_id_to_provision(monkeypatch):
    """An instance id stamped into GATEWAY_RELAY_INSTANCE_ID (as for a managed
    agent) is passed through to the provision call, letting the connector bind
    gatewayId -> instanceId for per-instance routing."""
    _arm(monkeypatch)
    monkeypatch.setenv("GATEWAY_RELAY_INSTANCE_ID", "inst-abc")

    # Swap the real HTTP call for a stub that records what it was given.
    seen: dict = {}
    monkeypatch.setattr(relay, "_post_provision", _stub_post(seen))

    provisioned = relay.self_provision_relay()

    assert provisioned is True
    assert seen["instance_id"] == "inst-abc"
def test_instance_id_absent_forwards_none(monkeypatch):
    """With no stamp (self-hosted / pre-Phase-6), instance_id is forwarded as
    None; the connector then stores null and per-instance routing has no
    binding yet."""
    _arm(monkeypatch)

    # Swap the real HTTP call for a stub that records what it was given.
    seen: dict = {}
    monkeypatch.setattr(relay, "_post_provision", _stub_post(seen))

    provisioned = relay.self_provision_relay()

    assert provisioned is True
    assert seen["instance_id"] is None
def test_post_provision_body_includes_instanceId_only_when_set(monkeypatch):
    """The real _post_provision puts `instanceId` in the JSON body ONLY when a
    value is supplied. Omitting the key lets the connector store null
    (back-compat) instead of binding an empty string."""
    import json

    recorded: dict = {}
    canned_reply = json.dumps(
        {"secret": "a" * 64, "deliveryKey": "b" * 64, "tenant": "t", "gatewayId": "gw-1"}
    ).encode()

    class _Resp:
        # Minimal context-manager response object returning the canned JSON reply.
        def __enter__(self):
            return self

        def __exit__(self, *a):
            return False

        def read(self):
            return canned_reply

    def _fake_urlopen(req, timeout=None):  # noqa: ANN001
        # Capture the decoded request body so the assertions can inspect it.
        recorded["body"] = json.loads(req.data.decode())
        return _Resp()

    monkeypatch.setattr("urllib.request.urlopen", _fake_urlopen)

    # Arguments shared by both calls; only instance_id differs between them.
    common_kwargs = {
        "provision_url": "https://c.example/relay/provision",
        "access_token": "tok",
        "gateway_id": "gw-1",
        "platform": "discord",
        "bot_id": "app",
        "gateway_endpoint": None,
        "route_keys": [],
    }

    # With an instance id -> present in the body.
    relay._post_provision(**common_kwargs, instance_id="inst-abc")
    assert recorded["body"]["instanceId"] == "inst-abc"

    # Without one -> the key is absent entirely (not "").
    relay._post_provision(**common_kwargs)
    assert "instanceId" not in recorded["body"]
# ─────────────────────────── fail-soft ───────────────────────────
def test_no_nas_token_is_non_fatal(monkeypatch):