From dedf5643d89b143fb573d51b9edd201db6479c3c Mon Sep 17 00:00:00 2001 From: Ben Barclay Date: Fri, 26 Jun 2026 14:01:48 +1000 Subject: [PATCH] =?UTF-8?q?fix(gateway):=20scale-to-zero=20never=20armed?= =?UTF-8?q?=20=E2=80=94=20arm-gate=20counted=20disabled=20placeholder=20pl?= =?UTF-8?q?atforms=20(#52831)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The scale-to-zero idle watcher never started on a correctly-opted-in, relay-only instance, so the gateway never ran its idle decision, never called go_dormant(), and never sent going_idle to the connector. Fly's autostop still suspended the machine on traffic-idle, but the connector never flipped the instance to buffered-only — so an inbound DM took the live delivery path, found no live session for the suspended machine, and was dropped fail-closed with no wake poke. The machine slept and never woke. Root cause: _scale_to_zero_should_arm() passed list(config.platforms.keys()) to messaging_is_relay_only_or_absent(). config.platforms is pre-seeded with a DISABLED placeholder PlatformConfig for every known platform (telegram, discord, slack, matrix, …), so the key set is always the full ~20-entry catalog regardless of what the instance actually runs. The relay-only check discarded "relay", saw the disabled placeholders as live direct-socket platforms, and returned False — so should_arm() was False and the watcher was never created. Verified live on a staging instance: config.platforms keys = [telegram, discord, slack, mattermost, matrix, relay] with only relay enabled=True; should_arm() = False. Fix: filter config.platforms to ENABLED entries before the relay-only check, mirroring the adapter-connect loop which already gates on `if not platform_config.enabled: continue`. This arms off the same notion of "active platform" the rest of start() already uses — no parallel concept. Also add a one-line not-armed diagnostic: when an instance IS opted in (the HERMES_SCALE_TO_ZERO stamp is set) but the watcher still doesn't arm, log why (relay_only_or_absent, the enabled platforms, wake_url present/missing). A non-opted instance stays silent. The arm path previously logged only on success, so a failed arm was invisible. Tests: the existing pure-helper tests passed bare names so they never exercised the call site that feeds the placeholder-laden config. Add behaviour-contract tests against the REAL _scale_to_zero_should_arm with a realistic config.platforms (relay enabled + others disabled). The F25 regression test (relay-only + disabled placeholders must arm) and the no-platform case are RED without this fix, GREEN with it; the genuinely-enabled-direct-platform / not-opted-in / no-wake-url cases stay correctly non-arming so the filter can't over-broaden. Wake mechanism itself verified healthy independently (direct wakeUrl GET resumed a suspended staging instance in 1.15s, clean resume signature). --- gateway/run.py | 66 +++++++++++++++- tests/gateway/test_scale_to_zero_watcher.py | 86 +++++++++++++++++++++ 2 files changed, 151 insertions(+), 1 deletion(-) diff --git a/gateway/run.py b/gateway/run.py index 8555480d7b7..5b3c2c88f36 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -3637,7 +3637,20 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew ) try: - platforms = list(self.config.platforms.keys()) if self.config else [] + # Only ENABLED platforms count. `config.platforms` is pre-seeded with a + # disabled placeholder PlatformConfig for every KNOWN platform (telegram, + # discord, slack, …), so `.keys()` is the full ~20-entry catalog regardless + # of what this instance actually runs. Passing the bare keys made + # `messaging_is_relay_only_or_absent` see those placeholders as live + # direct-socket platforms and return False, so scale-to-zero NEVER armed on + # a real relay-only instance. Mirror the connect loop, which already gates on + # `platform_config.enabled` (see the `if not platform_config.enabled: continue` + # in the adapter-connect loop) — arm off the same notion of "active platform." + platforms = ( + [p for p, pc in self.config.platforms.items() if getattr(pc, "enabled", False)] + if self.config + else [] + ) except Exception: # noqa: BLE001 platforms = [] try: @@ -3650,6 +3663,52 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew wake_url=wake_url, ) + def _log_scale_to_zero_not_armed_reason(self) -> None: + """Log why the idle watcher did NOT arm — but only for an OPTED-IN instance. + + A non-opted instance (no HERMES_SCALE_TO_ZERO stamp) not arming is the normal + case and must stay silent. When the Labs stamp IS set but the watcher still + didn't arm, that's the surprising case worth one INFO line so "why won't it + suspend/wake?" is a log grep, not a box-dive. + """ + from gateway.relay import relay_wake_url + from gateway.scale_to_zero import ( + messaging_is_relay_only_or_absent, + scale_to_zero_enabled, + ) + + try: + enabled = scale_to_zero_enabled() + if not enabled: + return # not opted in — normal, stay quiet + try: + active = ( + [ + getattr(p, "value", p) + for p, pc in self.config.platforms.items() + if getattr(pc, "enabled", False) + ] + if self.config + else [] + ) + except Exception: # noqa: BLE001 + active = [] + relay_only = messaging_is_relay_only_or_absent(active) + try: + wake_url = relay_wake_url() + except Exception: # noqa: BLE001 + wake_url = None + logger.info( + "scale-to-zero: NOT armed despite opt-in — " + "relay_only_or_absent=%s (enabled platforms=%s), wake_url=%s. " + "Need relay-only messaging + a registered wake URL.", + relay_only, + active or "none", + "set" if wake_url else "MISSING", + ) + except Exception: # noqa: BLE001 - diagnostics must never block startup + logger.debug("scale-to-zero: not-armed reason logging failed", exc_info=True) + def _scale_to_zero_is_idle(self) -> bool: from gateway.scale_to_zero import is_idle @@ -6150,6 +6209,11 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew self._scale_to_zero_idle_timeout_seconds(), ) asyncio.create_task(self._scale_to_zero_watcher()) + else: + # Surface WHY an OPTED-IN instance didn't arm (a non-opted instance + # not arming is normal — stay silent there). Without this, a failed + # arm is invisible and "why won't it suspend/wake?" needs a box-dive. + self._log_scale_to_zero_not_armed_reason() except Exception: # noqa: BLE001 - arming must never block startup logger.debug("scale-to-zero: arm check failed at startup", exc_info=True) diff --git a/tests/gateway/test_scale_to_zero_watcher.py b/tests/gateway/test_scale_to_zero_watcher.py index bb959e810c6..a48c64c67ae 100644 --- a/tests/gateway/test_scale_to_zero_watcher.py +++ b/tests/gateway/test_scale_to_zero_watcher.py @@ -149,3 +149,89 @@ def test_bg_work_false_when_quiet(): r._background_tasks = set() # No background tasks, no active processes in this fresh process. assert r._scale_to_zero_has_live_background_work() is False + + +# ── _scale_to_zero_should_arm: the CALL SITE feeds config.platforms (the F25 bug) ── +# +# config.platforms is pre-seeded with a DISABLED placeholder PlatformConfig for every +# known platform, so list(config.platforms.keys()) is always the full ~20-entry catalog +# regardless of what the instance runs. The arm check must filter to ENABLED platforms +# (mirroring the connect loop) before asking messaging_is_relay_only_or_absent — passing +# the bare placeholder keys made it see disabled `discord`/`telegram`/… as live direct +# platforms and refuse to arm on a real relay-only instance. The pure-helper tests in +# test_scale_to_zero.py pass bare names so they never exercised this call site. + + +def _arm_runner(monkeypatch, platform_states, *, enabled=True, wake_url="https://wake.example"): + """Build a GatewayRunner stand-in whose config.platforms mirrors a real load: + `platform_states` is {Platform: enabled_bool}; everything runs the REAL + _scale_to_zero_should_arm. Only the env flag + wake_url resolution are stubbed.""" + from types import SimpleNamespace + + from gateway.config import PlatformConfig + + r = GatewayRunner.__new__(GatewayRunner) + platforms = {p: PlatformConfig(enabled=en) for p, en in platform_states.items()} + r.config = SimpleNamespace(platforms=platforms) + + monkeypatch.setattr("gateway.scale_to_zero.scale_to_zero_enabled", lambda *a, **k: enabled) + monkeypatch.setattr("gateway.relay.relay_wake_url", lambda: wake_url) + return r + + +def test_arm_true_for_relay_only_with_disabled_placeholders(monkeypatch): + """The F25 regression test: relay ENABLED, every other platform present but + DISABLED (the real load_gateway_config() shape). Must arm — the disabled + placeholders must NOT count as live direct-socket platforms.""" + from gateway.platforms.base import Platform + + r = _arm_runner( + monkeypatch, + { + Platform.TELEGRAM: False, + Platform.DISCORD: False, + Platform.SLACK: False, + Platform.MATRIX: False, + Platform.RELAY: True, + }, + ) + assert r._scale_to_zero_should_arm() is True + + +def test_no_arm_when_a_direct_platform_is_actually_enabled(monkeypatch): + """A genuinely-enabled direct-socket platform (real Discord token) DOES disarm — + the filter must not over-broaden to 'ignore everything but relay'.""" + from gateway.platforms.base import Platform + + r = _arm_runner( + monkeypatch, + {Platform.DISCORD: True, Platform.RELAY: True}, + ) + assert r._scale_to_zero_should_arm() is False + + +def test_arm_when_no_platform_enabled_at_all(monkeypatch): + """Chronos-only / no-messaging agent (all placeholders disabled) can scale to zero.""" + from gateway.platforms.base import Platform + + r = _arm_runner( + monkeypatch, + {Platform.TELEGRAM: False, Platform.DISCORD: False}, + ) + assert r._scale_to_zero_should_arm() is True + + +def test_no_arm_when_not_opted_in(monkeypatch): + """Relay-only but the Labs stamp is off ⇒ never arm (fail-safe default).""" + from gateway.platforms.base import Platform + + r = _arm_runner(monkeypatch, {Platform.RELAY: True}, enabled=False) + assert r._scale_to_zero_should_arm() is False + + +def test_no_arm_without_wake_url(monkeypatch): + """Relay-only + opted in but no registered wake URL ⇒ no arm (§3.4(1)).""" + from gateway.platforms.base import Platform + + r = _arm_runner(monkeypatch, {Platform.RELAY: True}, wake_url=None) + assert r._scale_to_zero_should_arm() is False