fix(gateway): scale-to-zero never armed — arm-gate counted disabled placeholder platforms (#52831)

The scale-to-zero idle watcher never started on a correctly-opted-in,
relay-only instance, so the gateway never ran its idle decision, never called
go_dormant(), and never sent going_idle to the connector. Fly's autostop still
suspended the machine on traffic-idle, but the connector never flipped the
instance to buffered-only — so an inbound DM took the live delivery path,
found no live session for the suspended machine, and was dropped fail-closed
with no wake poke. The machine slept and never woke.

Root cause: _scale_to_zero_should_arm() passed list(config.platforms.keys())
to messaging_is_relay_only_or_absent(). config.platforms is pre-seeded with a
DISABLED placeholder PlatformConfig for every known platform (telegram,
discord, slack, matrix, …), so the key set is always the full ~20-entry
catalog regardless of what the instance actually runs. The relay-only check
discarded "relay", saw the disabled placeholders as live direct-socket
platforms, and returned False — so should_arm() was False and the watcher was
never created. Verified live on a staging instance: config.platforms keys =
[telegram, discord, slack, mattermost, matrix, relay] with only relay
enabled=True; should_arm() = False.

Fix: filter config.platforms to ENABLED entries before the relay-only check,
mirroring the adapter-connect loop which already gates on
`if not platform_config.enabled: continue`. This arms off the same notion of
"active platform" the rest of start() already uses — no parallel concept.

Also add a one-line not-armed diagnostic: when an instance IS opted in (the
HERMES_SCALE_TO_ZERO stamp is set) but the watcher still doesn't arm, log why
(relay_only_or_absent, the enabled platforms, wake_url present/missing). A
non-opted instance stays silent. The arm path previously logged only on
success, so a failed arm was invisible.

Tests: the existing pure-helper tests passed bare names so they never
exercised the call site that feeds the placeholder-laden config. Add
behaviour-contract tests against the REAL _scale_to_zero_should_arm with a
realistic config.platforms (relay enabled + others disabled). The F25
regression test (relay-only + disabled placeholders must arm) and the
no-platform case are RED without this fix, GREEN with it; the
genuinely-enabled-direct-platform / not-opted-in / no-wake-url cases stay
correctly non-arming so the filter can't over-broaden.

Wake mechanism itself verified healthy independently (direct wakeUrl GET
resumed a suspended staging instance in 1.15s, clean resume signature).
This commit is contained in:
Ben Barclay 2026-06-26 14:01:48 +10:00 committed by GitHub
parent a4091e49f1
commit dedf5643d8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 151 additions and 1 deletions

View file

@ -3637,7 +3637,20 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
)
try:
platforms = list(self.config.platforms.keys()) if self.config else []
# Only ENABLED platforms count. `config.platforms` is pre-seeded with a
# disabled placeholder PlatformConfig for every KNOWN platform (telegram,
# discord, slack, …), so `.keys()` is the full ~20-entry catalog regardless
# of what this instance actually runs. Passing the bare keys made
# `messaging_is_relay_only_or_absent` see those placeholders as live
# direct-socket platforms and return False, so scale-to-zero NEVER armed on
# a real relay-only instance. Mirror the connect loop, which already gates on
# `platform_config.enabled` (see the `if not platform_config.enabled: continue`
# in the adapter-connect loop) — arm off the same notion of "active platform."
platforms = (
[p for p, pc in self.config.platforms.items() if getattr(pc, "enabled", False)]
if self.config
else []
)
except Exception: # noqa: BLE001
platforms = []
try:
@ -3650,6 +3663,52 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
wake_url=wake_url,
)
def _log_scale_to_zero_not_armed_reason(self) -> None:
    """Log why the idle watcher did NOT arm — but only for an OPTED-IN instance.

    A non-opted instance (no HERMES_SCALE_TO_ZERO stamp, i.e.
    ``scale_to_zero_enabled()`` is falsy) not arming is the normal case and
    stays silent. When the stamp IS set but the watcher still didn't arm, emit
    exactly one INFO line carrying the relay-only verdict, the enabled
    platforms, and whether a wake URL is set, so "why won't it suspend/wake?"
    is a log grep, not a box-dive.

    Best-effort by design: any failure — including a failed import of the
    gateway helpers — is logged at DEBUG and swallowed, so this diagnostic can
    never block startup. Returns ``None`` in every case.
    """
    try:
        # Import inside the guard: an ImportError here must be swallowed like
        # any other diagnostic failure instead of escaping to the caller.
        from gateway.relay import relay_wake_url
        from gateway.scale_to_zero import (
            messaging_is_relay_only_or_absent,
            scale_to_zero_enabled,
        )

        if not scale_to_zero_enabled():
            return  # not opted in — normal, stay quiet

        try:
            # Only ENABLED entries count as active. Keys that expose a
            # `.value` attribute are reported by that value; anything else is
            # passed through unchanged.
            active = (
                [
                    getattr(p, "value", p)
                    for p, pc in self.config.platforms.items()
                    if getattr(pc, "enabled", False)
                ]
                if self.config
                else []
            )
        except Exception:  # noqa: BLE001
            # A malformed config must not hide the diagnostic; report "none".
            active = []

        relay_only = messaging_is_relay_only_or_absent(active)

        try:
            wake_url = relay_wake_url()
        except Exception:  # noqa: BLE001
            # Treat a failing lookup the same as an unregistered wake URL.
            wake_url = None

        logger.info(
            "scale-to-zero: NOT armed despite opt-in — "
            "relay_only_or_absent=%s (enabled platforms=%s), wake_url=%s. "
            "Need relay-only messaging + a registered wake URL.",
            relay_only,
            active or "none",
            # Log presence only, never the URL itself.
            "set" if wake_url else "MISSING",
        )
    except Exception:  # noqa: BLE001 - diagnostics must never block startup
        logger.debug("scale-to-zero: not-armed reason logging failed", exc_info=True)
def _scale_to_zero_is_idle(self) -> bool:
from gateway.scale_to_zero import is_idle
@ -6150,6 +6209,11 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
self._scale_to_zero_idle_timeout_seconds(),
)
asyncio.create_task(self._scale_to_zero_watcher())
else:
# Surface WHY an OPTED-IN instance didn't arm (a non-opted instance
# not arming is normal — stay silent there). Without this, a failed
# arm is invisible and "why won't it suspend/wake?" needs a box-dive.
self._log_scale_to_zero_not_armed_reason()
except Exception: # noqa: BLE001 - arming must never block startup
logger.debug("scale-to-zero: arm check failed at startup", exc_info=True)

View file

@ -149,3 +149,89 @@ def test_bg_work_false_when_quiet():
r._background_tasks = set()
# No background tasks, no active processes in this fresh process.
assert r._scale_to_zero_has_live_background_work() is False
# ── _scale_to_zero_should_arm: the CALL SITE feeds config.platforms (the F25 bug) ──
#
# config.platforms is pre-seeded with a DISABLED placeholder PlatformConfig for every
# known platform, so list(config.platforms.keys()) is always the full ~20-entry catalog
# regardless of what the instance runs. The arm check must filter to ENABLED platforms
# (mirroring the connect loop) before asking messaging_is_relay_only_or_absent — passing
# the bare placeholder keys made it see disabled `discord`/`telegram`/… as live direct
# platforms and refuse to arm on a real relay-only instance. The pure-helper tests in
# test_scale_to_zero.py pass bare names so they never exercised this call site.
def _arm_runner(monkeypatch, platform_states, *, enabled=True, wake_url="https://wake.example"):
    """Return a GatewayRunner stand-in wired for the arm-check tests.

    The runner is created with ``__new__`` (so ``__init__`` is bypassed) and
    given a ``config`` whose ``platforms`` mapping holds one ``PlatformConfig``
    per entry of ``platform_states`` ({Platform: enabled_bool}). The REAL
    ``_scale_to_zero_should_arm`` runs against it; only the opt-in flag
    (``enabled``) and the wake-URL lookup (``wake_url``) are stubbed out.
    """
    from types import SimpleNamespace
    from gateway.config import PlatformConfig

    def _stub_opt_in(*_args, **_kwargs):
        # Accepts any call shape and reports the requested opt-in state.
        return enabled

    def _stub_wake_url():
        return wake_url

    runner = GatewayRunner.__new__(GatewayRunner)

    seeded = {}
    for platform, is_on in platform_states.items():
        seeded[platform] = PlatformConfig(enabled=is_on)
    runner.config = SimpleNamespace(platforms=seeded)

    monkeypatch.setattr("gateway.scale_to_zero.scale_to_zero_enabled", _stub_opt_in)
    monkeypatch.setattr("gateway.relay.relay_wake_url", _stub_wake_url)
    return runner
def test_arm_true_for_relay_only_with_disabled_placeholders(monkeypatch):
    """F25 regression: relay is ENABLED while every other platform is present
    but DISABLED (the real load_gateway_config() shape). The watcher must arm —
    disabled placeholders must NOT count as live direct-socket platforms."""
    from gateway.platforms.base import Platform

    # Disabled placeholders first, relay last — same key order as a real load.
    states = dict.fromkeys(
        (Platform.TELEGRAM, Platform.DISCORD, Platform.SLACK, Platform.MATRIX),
        False,
    )
    states[Platform.RELAY] = True

    runner = _arm_runner(monkeypatch, states)
    armed = runner._scale_to_zero_should_arm()
    assert armed is True
def test_no_arm_when_a_direct_platform_is_actually_enabled(monkeypatch):
    """An enabled direct-socket platform (Discord here) alongside relay must
    still disarm: filtering to enabled entries must not degrade into
    'ignore everything but relay'."""
    from gateway.platforms.base import Platform

    states = {Platform.DISCORD: True, Platform.RELAY: True}
    runner = _arm_runner(monkeypatch, states)
    armed = runner._scale_to_zero_should_arm()
    assert armed is False
def test_arm_when_no_platform_enabled_at_all(monkeypatch):
    """With every placeholder disabled (a Chronos-only / no-messaging agent),
    the instance is still allowed to scale to zero."""
    from gateway.platforms.base import Platform

    states = {Platform.TELEGRAM: False, Platform.DISCORD: False}
    runner = _arm_runner(monkeypatch, states)
    armed = runner._scale_to_zero_should_arm()
    assert armed is True
def test_no_arm_when_not_opted_in(monkeypatch):
    """Relay-only messaging with the Labs stamp off must never arm
    (fail-safe default)."""
    from gateway.platforms.base import Platform

    states = {Platform.RELAY: True}
    runner = _arm_runner(monkeypatch, states, enabled=False)
    armed = runner._scale_to_zero_should_arm()
    assert armed is False
def test_no_arm_without_wake_url(monkeypatch):
    """Relay-only and opted in, but with no registered wake URL, must not arm
    (§3.4(1))."""
    from gateway.platforms.base import Platform

    states = {Platform.RELAY: True}
    runner = _arm_runner(monkeypatch, states, wake_url=None)
    armed = runner._scale_to_zero_should_arm()
    assert armed is False