fix(update): announce gateway drain waits so desktop updates don't look hung

On macOS, the desktop updater's stage 1 (hermes update --gateway) ends by
restarting running gateways. launchd_restart() SIGTERMs the gateway and
silently waits up to agent.restart_drain_timeout (default 180s) for the
drain; the manual profile-gateway loop waits its drain budget per gateway
the same way. Neither path prints anything before the wait, so the desktop
updater's live output goes dead for minutes right after '✓ Update
complete!' — users read it as a hung update and force-kill their gateway
processes to make it move (#44515). The systemd branch already announces
its drain ('draining (up to Ns)...'); launchd and the manual loop do not.

Print a stop/drain announcement (with PID and drain budget) before the
wait in both paths, mirroring the systemd branch, and assert the launchd
message in the existing launchd drain test.

Fixes #44515
This commit is contained in:
AIalliAI 2026-06-11 23:45:15 +00:00 committed by Brooklyn Nicholson
parent d335164833
commit 463bf2be25
3 changed files with 25 additions and 1 deletions

View file

@ -3891,6 +3891,16 @@ def launchd_restart():
print("✓ Service restart requested")
return
if pid is not None:
# Announce the drain BEFORE waiting on it. This wait can run for
# the full drain budget (180s by default) while the old gateway
# finishes in-flight agent runs, and it streams into surfaces with
# no other feedback — the desktop updater's live output most of
# all, where a silent stop here reads as "update stuck" (#44515).
# Mirrors the systemd branch's "draining (up to Ns)..." line.
print(
f"→ Stopping gateway (PID {pid}) — draining in-flight runs "
f"(up to {drain_timeout:.0f}s)..."
)
try:
terminate_pid(pid, force=False)
except (ProcessLookupError, PermissionError, OSError):

View file

@ -10178,6 +10178,14 @@ def _cmd_update_impl(args, gateway_mode: bool):
# gateway doesn't support SIGUSR1 or doesn't exit within
# the drain budget, fall back to SIGTERM — the watcher
# still sees the exit and relaunches either way.
# Announce the drain first: this wait can hold for the full
# budget per gateway with no other output, and on surfaces
# that stream update progress (the desktop updater most of
# all) the silence reads as a hung update (#44515).
print(
f"{proc.profile}: draining gateway PID {pid} "
f"(up to {int(_drain_budget)}s)..."
)
drained = _graceful_restart_via_sigusr1(
pid,
drain_timeout=_drain_budget,

View file

@ -774,7 +774,7 @@ class TestLaunchdServiceRecovery:
["launchctl", "kickstart", target],
]
def test_launchd_restart_drains_running_gateway_before_kickstart(self, monkeypatch):
def test_launchd_restart_drains_running_gateway_before_kickstart(self, monkeypatch, capsys):
calls = []
target = f"{gateway_cli._launchd_domain()}/{gateway_cli.get_launchd_label()}"
@ -799,6 +799,12 @@ class TestLaunchdServiceRecovery:
("term", 321, False),
["launchctl", "kickstart", "-k", target],
]
# The drain can silently hold for the full budget (180s default); the
# desktop updater streams this output as its only progress feedback,
# so the stop must be announced BEFORE the wait (#44515).
out = capsys.readouterr().out
assert "draining in-flight runs" in out
assert "up to 12s" in out
def test_launchd_restart_self_requests_graceful_restart_without_kickstart(self, monkeypatch, capsys):
calls = []