From 75a70d98f322378b978695f832813af9c05ced83 Mon Sep 17 00:00:00 2001 From: Ben Barclay Date: Mon, 22 Jun 2026 21:46:59 +1000 Subject: [PATCH] =?UTF-8?q?feat(relay):=20forward=20a=20stable=20instance?= =?UTF-8?q?=20id=20at=20self-provision=20(Phase=206=20Unit=20=CE=B1)=20(#5?= =?UTF-8?q?0772)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add relay_instance_id() (env GATEWAY_RELAY_INSTANCE_ID first, then gateway.relay_instance_id in config.yaml, mirroring the other relay readers) and forward it in the /relay/provision body so the connector can bind gatewayId -> instanceId and route inbound per-instance once Phase 6 delivery lands. The value is gateway-asserted but safely scoped: the org/tenant stays NAS-token-verified at the connector, so a dishonest gateway can only bind its OWN tenant's instance — same posture as relay_endpoint(). instanceId is only added to the body when present, so omitting it lets the connector store null (back-compat: self-hosted / pre-Phase-6 gateways simply have no binding yet). For a managed (NAS-hosted) agent the id is NAS's AgentInstance.id, stamped into the container env beside GATEWAY_RELAY_URL. Tests: reader (env/config/absent), self_provision_relay forwards the id (set + absent), and the real _post_provision body includes instanceId ONLY when set. Refs: ~/nous/specs/gateway-gateway plan.md Phase 6 Unit α; decisions.md Q11. --- gateway/relay/__init__.py | 37 ++++++++- tests/gateway/relay/test_self_provision.py | 94 ++++++++++++++++++++++ 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/gateway/relay/__init__.py b/gateway/relay/__init__.py index 4b3fdda8a8d..5bf237ec1f0 100644 --- a/gateway/relay/__init__.py +++ b/gateway/relay/__init__.py @@ -131,6 +131,33 @@ def relay_route_keys() -> list[str]: return [k.strip() for k in raw.split(",") if k.strip()] +def relay_instance_id() -> Optional[str]: + """Stable per-instance id this gateway forwards at provision (Phase 6 Unit α). + + Binds the connector's ``gatewayId -> instanceId`` so the connector can route + inbound per-instance (not tenant-broadcast) once Phase 6 delivery lands. The + value is the NAS ``AgentInstance.id`` for a managed agent (NAS stamps + ``GATEWAY_RELAY_INSTANCE_ID`` into the container env, beside + ``GATEWAY_RELAY_URL``); a self-hosted operator may set it explicitly. It is + gateway-asserted but safely scoped: the org/tenant stays token-verified, so a + dishonest gateway can only bind ITS OWN tenant's instance — the same posture + as ``relay_endpoint()``. Absent -> the connector stores null and per-instance + routing simply has no binding for this connection yet (back-compat). + + Env first (Docker/NAS), then ``gateway.relay_instance_id`` in config.yaml. + """ + value = os.environ.get("GATEWAY_RELAY_INSTANCE_ID", "").strip() + if not value: + try: + from gateway.run import _load_gateway_config # late import to avoid cycle + + cfg = (_load_gateway_config().get("gateway") or {}) + value = str(cfg.get("relay_instance_id", "") or "").strip() + except Exception: # noqa: BLE001 - config absence/parse must never crash boot + value = "" + return value or None + + def _provision_url(relay_dial_url: str) -> str: """Map the ``ws(s)://…/relay`` dial URL to the ``http(s)://…/relay/provision`` POST URL.""" raw = relay_dial_url.rstrip("/") @@ -152,6 +179,7 @@ def _post_provision( bot_id: str, gateway_endpoint: Optional[str], route_keys: list[str], + instance_id: Optional[str] = None, timeout: float = 15.0, ) -> dict: """POST to the connector's ``/relay/provision`` and return the JSON body. @@ -173,6 +201,10 @@ def _post_provision( "gatewayEndpoint": gateway_endpoint or "", "routeKeys": route_keys, } + # Only send instanceId when we actually have one — omitting it lets the + # connector store null (back-compat) rather than binding an empty string. + if instance_id: + body["instanceId"] = instance_id data = json.dumps(body).encode("utf-8") req = urllib.request.Request( provision_url, @@ -277,6 +309,7 @@ def self_provision_relay() -> bool: gateway_id = os.environ.get("GATEWAY_RELAY_ID", "").strip() or f"gw-{host or 'hermes'}" endpoint = relay_endpoint() route_keys = relay_route_keys() + instance_id = relay_instance_id() try: result = _post_provision( @@ -287,6 +320,7 @@ def self_provision_relay() -> bool: bot_id=bot_id, gateway_endpoint=endpoint, route_keys=route_keys, + instance_id=instance_id, ) except RuntimeError as exc: logger.warning("relay self-provision failed (%s); gateway will boot without relay auth", exc) @@ -302,11 +336,12 @@ def self_provision_relay() -> bool: os.environ["GATEWAY_RELAY_DELIVERY_KEY"] = str(result.get("deliveryKey") or "") tenant = str(result.get("tenant") or "") logger.info( - "relay self-provisioned (gateway_id=%s tenant=%s routes=%d inbound=%s)", + "relay self-provisioned (gateway_id=%s tenant=%s routes=%d inbound=%s instance=%s)", os.environ["GATEWAY_RELAY_ID"], tenant or "?", len(route_keys), "yes" if endpoint else "outbound-only", + instance_id or "unbound", ) return True diff --git a/tests/gateway/relay/test_self_provision.py b/tests/gateway/relay/test_self_provision.py index c5af66f94ef..aad4e176fc5 100644 --- a/tests/gateway/relay/test_self_provision.py +++ b/tests/gateway/relay/test_self_provision.py @@ -30,6 +30,7 @@ def _clean_env(monkeypatch): "GATEWAY_RELAY_ROUTE_KEYS", "GATEWAY_RELAY_PLATFORM", "GATEWAY_RELAY_BOT_ID", + "GATEWAY_RELAY_INSTANCE_ID", ): monkeypatch.delenv(k, raising=False) # Never read config.yaml off disk in these tests. @@ -83,6 +84,24 @@ def test_relay_route_keys_empty(): assert relay.relay_route_keys() == [] +def test_relay_instance_id_from_env(monkeypatch): + monkeypatch.setenv("GATEWAY_RELAY_INSTANCE_ID", " inst-abc ") + assert relay.relay_instance_id() == "inst-abc" + + +def test_relay_instance_id_absent_is_none(): + assert relay.relay_instance_id() is None + + +def test_relay_instance_id_from_config(monkeypatch): + monkeypatch.setattr( + "gateway.run._load_gateway_config", + lambda: {"gateway": {"relay_instance_id": "inst-from-config"}}, + raising=False, + ) + assert relay.relay_instance_id() == "inst-from-config" + + def test_provision_url_maps_ws_to_http(): assert relay._provision_url("wss://c.example/relay") == "https://c.example/relay/provision" assert relay._provision_url("ws://c.example/relay") == "http://c.example/relay/provision" @@ -161,6 +180,81 @@ def test_outbound_only_when_no_endpoint(monkeypatch): assert relay.relay_connection_auth()[1] == "a" * 64 +# ─────────────────── instance-id forwarding (Phase 6 Unit α) ─────────────────── + +def test_forwards_instance_id_to_provision(monkeypatch): + """A managed agent stamped with GATEWAY_RELAY_INSTANCE_ID forwards it to the + connector so it can bind gatewayId -> instanceId (per-instance routing).""" + _arm(monkeypatch) + monkeypatch.setenv("GATEWAY_RELAY_INSTANCE_ID", "inst-abc") + captured: dict = {} + monkeypatch.setattr(relay, "_post_provision", _stub_post(captured)) + + assert relay.self_provision_relay() is True + assert captured["instance_id"] == "inst-abc" + + +def test_instance_id_absent_forwards_none(monkeypatch): + """No stamp (self-hosted / pre-Phase-6) -> instance_id None; the connector + stores null and per-instance routing simply has no binding yet.""" + _arm(monkeypatch) + captured: dict = {} + monkeypatch.setattr(relay, "_post_provision", _stub_post(captured)) + + assert relay.self_provision_relay() is True + assert captured["instance_id"] is None + + +def test_post_provision_body_includes_instanceId_only_when_set(monkeypatch): + """The real _post_provision adds `instanceId` to the JSON body ONLY when a + value is supplied — omitting it lets the connector store null (back-compat), + rather than binding an empty string.""" + import json + + sent: dict = {} + + class _Resp: + def __enter__(self): + return self + + def __exit__(self, *a): + return False + + def read(self): + return json.dumps({"secret": "a" * 64, "deliveryKey": "b" * 64, "tenant": "t", "gatewayId": "gw-1"}).encode() + + def _fake_urlopen(req, timeout=None): # noqa: ANN001 + sent["body"] = json.loads(req.data.decode()) + return _Resp() + + monkeypatch.setattr("urllib.request.urlopen", _fake_urlopen) + + # With an instance id -> present in the body. + relay._post_provision( + provision_url="https://c.example/relay/provision", + access_token="tok", + gateway_id="gw-1", + platform="discord", + bot_id="app", + gateway_endpoint=None, + route_keys=[], + instance_id="inst-abc", + ) + assert sent["body"]["instanceId"] == "inst-abc" + + # Without one -> the key is absent entirely (not "" ). + relay._post_provision( + provision_url="https://c.example/relay/provision", + access_token="tok", + gateway_id="gw-1", + platform="discord", + bot_id="app", + gateway_endpoint=None, + route_keys=[], + ) + assert "instanceId" not in sent["body"] + + # ─────────────────────────── fail-soft ─────────────────────────── def test_no_nas_token_is_non_fatal(monkeypatch):