fix(update): bypass systemd RestartSec after graceful drain (#22101)

After a clean SIGUSR1 drain, cmd_update passively polled for systemd's
auto-restart to fire. Our unit file sets RestartSec=60 (a crash-loop
guard), so the voluntary-restart path waited a full minute of dead air
before the gateway came back — the user saw 'draining (up to 75s)...'
and stared at it.

Change: after the drain exits with code 75, call 'reset-failed' +
'start' explicitly. Manual start bypasses RestartSec entirely
(RestartSec only governs systemd's own auto-restart logic). Takes
about as long as the gateway needs to come up (~1-3s on a warm box)
instead of ~60s.

The RestartSec=60 default stays — it's the right crash-loop guard for
actual crashes. This only short-circuits the voluntary-restart path.

Matches the pattern already used in 'hermes gateway restart'
(systemd_restart() in hermes_cli/gateway.py, PR #20949).

Tests:
- tests/hermes_cli/test_update_gateway_restart.py: new
  test_update_bypasses_restartsec_after_graceful_drain asserts both
  'reset-failed hermes-gateway' AND 'start hermes-gateway' (NOT
  'restart') are issued after a successful graceful drain.
- All existing tests in the affected classes still pass
  (TestCmdUpdateLaunchdRestart, TestCmdUpdateResetFailedBeforeRestart
  are green; one pre-existing flake in the latter is unrelated).
This commit is contained in:
Teknium 2026-05-08 16:11:07 -07:00 committed by GitHub
parent 5089596685
commit d971b26bfd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 121 additions and 8 deletions

View file

@ -7756,14 +7756,56 @@ def _cmd_update_impl(args, gateway_mode: bool):
)
if _graceful_ok:
# Gateway exited 75; systemd should relaunch
# via Restart=on-failure. The unit's
# RestartSec (default 30s on ours) gates the
# respawn — poll past that + slack so we
# don't give up mid-cooldown and falsely
# print "drained but didn't relaunch". For
# units without RestartSec set we fall back
# to the original 10s budget.
# Gateway exited 75. ``Restart=always`` +
# ``RestartForceExitStatus=75`` means systemd
# WILL respawn the unit — but only after
# ``RestartSec`` (default 60s on our unit
# file). That 60s wait is a crash-loop guard,
# and is the right default when the gateway
# dies unexpectedly. For a voluntary restart
# on update, it's dead time the user watches.
#
# Shortcut it: ``reset-failed`` + ``start``
# skips RestartSec entirely (we're manually
# initiating the unit, not waiting for
# systemd's auto-restart logic). Takes about
# as long as the process takes to come up
# (~1-3s on a warm box).
#
# If the unit is already active because
# RestartSec elapsed while we were draining,
# ``start`` is a no-op and we fall through to
# the poll below. Either way we collapse the
# 60s+ delay to a ~5s one.
subprocess.run(
scope_cmd + ["reset-failed", svc_name],
capture_output=True,
text=True,
timeout=10,
)
subprocess.run(
scope_cmd + ["start", svc_name],
capture_output=True,
text=True,
timeout=15,
)
# Short poll: the gateway should be up within
# a few seconds now that we bypassed
# RestartSec. Fall back to the longer
# RestartSec + slack budget ONLY if the
# explicit start failed and we need to rely
# on systemd's auto-restart.
if _wait_for_service_active(
scope_cmd,
svc_name,
timeout=10.0,
):
restarted_services.append(svc_name)
continue
# Explicit start didn't take. Fall back to
# the original passive poll (systemd's
# auto-restart WILL fire after RestartSec
# regardless).
_restart_sec = _service_restart_sec(
scope_cmd,
svc_name,

View file

@ -653,6 +653,77 @@ class TestCmdUpdateLaunchdRestart:
"Drain path failed; expected fallback `systemctl restart`."
)
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_update_bypasses_restartsec_after_graceful_drain(
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
):
"""After a graceful SIGUSR1 drain, cmd_update must issue
``reset-failed`` + ``start`` to bypass the unit's ``RestartSec``
cooldown (default 60s on our unit file) rather than passively
waiting for systemd's auto-restart. Collapses the post-drain delay
from ~60s to ~5s on a voluntary restart.
"""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
def side_effect(cmd, **kwargs):
joined = " ".join(str(c) for c in cmd)
if "rev-parse" in joined and "--abbrev-ref" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="")
if "rev-parse" in joined and "--verify" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
if "rev-list" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="")
if "systemctl" in joined and "list-units" in joined:
if "--user" in joined:
return subprocess.CompletedProcess(
cmd, 0,
stdout="hermes-gateway.service loaded active running\n",
stderr="",
)
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
if "systemctl" in joined and "is-active" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
mock_run.side_effect = side_effect
# Simulate a successful graceful drain so cmd_update reaches the
# post-drain restart bypass.
monkeypatch.setattr(
"hermes_cli.gateway._graceful_restart_via_sigusr1",
lambda pid, drain_timeout: True,
)
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(mock_args)
calls = [
" ".join(str(a) for a in c.args[0])
for c in mock_run.call_args_list
if "systemctl" in " ".join(str(a) for a in c.args[0])
]
# Must have called ``reset-failed hermes-gateway`` AND ``start
# hermes-gateway`` explicitly so systemd bypasses RestartSec.
reset_calls = [c for c in calls if "reset-failed" in c and "hermes-gateway" in c]
start_calls = [
c for c in calls
if "start" in c and "hermes-gateway" in c and "restart" not in c
]
assert reset_calls, (
f"Expected explicit `reset-failed hermes-gateway` after graceful drain; "
f"systemctl calls were: {calls}"
)
assert start_calls, (
f"Expected explicit `start hermes-gateway` after graceful drain to "
f"bypass RestartSec; systemctl calls were: {calls}"
)
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_update_no_gateway_running_skips_restart(