fix(gateway): harden scale-to-zero dormancy guards (#52359)
Some checks are pending
CI / detect (push) Waiting to run
CI / tests (push) Blocked by required conditions
CI / lint (push) Blocked by required conditions
CI / typecheck (push) Blocked by required conditions
CI / docs-site (push) Blocked by required conditions
CI / history-check (push) Blocked by required conditions
CI / contributor-check (push) Blocked by required conditions
CI / uv-lockfile (push) Blocked by required conditions
CI / docker-lint (push) Blocked by required conditions
CI / supply-chain (push) Blocked by required conditions
CI / osv-scanner (push) Blocked by required conditions
CI / All required checks pass (push) Blocked by required conditions
Deploy Site / deploy-vercel (push) Waiting to run
Deploy Site / deploy-docs (push) Waiting to run
Docker Build and Publish / build-amd64 (push) Waiting to run
Docker Build and Publish / build-arm64 (push) Waiting to run
Docker Build and Publish / merge (push) Blocked by required conditions

Block scale-to-zero suspend while background async delegations are active, and restore runtime status to running on real inbound after a dormant wake.

Add regression coverage for both review findings.
This commit is contained in:
Ben Barclay 2026-06-25 20:41:03 +10:00 committed by GitHub
parent e62afaca62
commit d6269da7fd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 56 additions and 1 deletions

View file

@ -3595,6 +3595,13 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
"""
if any(not t.done() for t in self._background_tasks):
return True
try:
from tools.async_delegation import active_count
if active_count() > 0:
return True
except Exception: # noqa: BLE001 - never let the idle check raise
logger.debug("scale-to-zero async-delegation check failed", exc_info=True)
try:
from tools.process_registry import process_registry
@ -3653,6 +3660,23 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
has_live_background_work=self._scale_to_zero_has_live_background_work(),
)
def _scale_to_zero_note_real_inbound(self) -> None:
    """Record a real inbound message and undo the dormancy lifecycle marker.

    Always refreshes ``_last_inbound_at`` with the current wall-clock time.

    When ``_scale_to_zero_cooldown_until`` is positive, the runtime status is
    additionally set back to ``"running"`` and the cooldown is cleared to
    ``0.0``. The rationale: going dormant leaves the runtime status at
    ``draining`` while the relay is quiesced, yet the process itself stays
    alive (this is not the stop/restart drain path), so it should report as
    running again as soon as genuine traffic re-enters the gateway.

    Internal completion/replay events are expected NOT to call this helper, so
    that they cannot keep an otherwise idle gateway awake.
    """
    self._last_inbound_at = time.time()

    # A positive cooldown is the marker this helper uses to decide that
    # lifecycle state needs restoring; the attribute may be absent on
    # instances built without ``__init__``, hence the getattr default.
    needs_restore = getattr(self, "_scale_to_zero_cooldown_until", 0.0) > 0
    if not needs_restore:
        return

    try:
        self._update_runtime_status("running")
    except Exception:  # noqa: BLE001 - status restoration is best-effort
        logger.debug("scale-to-zero: status restore failed", exc_info=True)

    # Cleared even if the status write failed, so the restore is attempted
    # once per wake rather than on every subsequent inbound message.
    self._scale_to_zero_cooldown_until = 0.0
def _relay_adapter_for_dormancy(self):
"""Return the connected RELAY adapter, if any (the one go_dormant targets)."""
try:
@ -7504,7 +7528,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
# traffic — counting them would keep a genuinely idle gateway awake. This
# clock is what the idle predicate (gateway/scale_to_zero.is_idle) reads.
if not is_internal:
self._last_inbound_at = time.time()
self._scale_to_zero_note_real_inbound()
# Fire pre_gateway_dispatch plugin hook for user-originated messages.
# Plugins receive the MessageEvent and may return a dict influencing flow:

View file

@ -113,6 +113,37 @@ def test_bg_work_blocks_idle_via_background_tasks(monkeypatch):
loop.close()
def test_bg_work_blocks_idle_via_async_delegation(monkeypatch):
    """An active background delegation alone must count as live work.

    ``delegate_task(background=true)`` is tracked by ``tools.async_delegation``
    rather than by the process registry, so the suspend guard has to consult
    it as well. Here there are no gateway background tasks at all, and the
    delegation counter is forced to report one active delegation.
    """
    # Patch first: the counter is looked up at call time by the method.
    monkeypatch.setattr("tools.async_delegation.active_count", lambda: 1)

    # Bypass __init__ and provide only the state the method reads.
    runner = GatewayRunner.__new__(GatewayRunner)
    runner._background_tasks = set()

    assert runner._scale_to_zero_has_live_background_work() is True
def test_real_inbound_after_dormancy_restores_running_status(monkeypatch):
    """Real inbound after a dormant wake must flip the status back to running.

    Guards against the runtime lifecycle staying stuck in the ``draining``
    state written by the watcher once genuine traffic has woken the gateway.
    """
    # Bypass __init__; a cooldown in the future marks the runner as having
    # gone dormant.
    runner = GatewayRunner.__new__(GatewayRunner)
    runner._last_inbound_at = 0.0
    runner._scale_to_zero_cooldown_until = time.time() + 60.0

    recorded_states = []

    def _record_status(state=None, *args, **kwargs):
        # Capture only the requested state; extra arguments are ignored.
        recorded_states.append(state)

    monkeypatch.setattr(
        runner,
        "_update_runtime_status",
        _record_status,
        raising=False,
    )

    runner._scale_to_zero_note_real_inbound()

    assert runner._last_inbound_at > 0.0
    assert recorded_states == ["running"]
def test_bg_work_false_when_quiet():
r = GatewayRunner.__new__(GatewayRunner)
r._background_tasks = set()