From d971b26bfd8305285cac1f47c84cceef67624701 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 8 May 2026 16:11:07 -0700 Subject: [PATCH] fix(update): bypass systemd RestartSec after graceful drain (#22101) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After a clean SIGUSR1 drain, cmd_update passively polled for systemd's auto-restart to fire. Our unit file sets RestartSec=60 (a crash-loop guard), so the voluntary-restart path waited a full minute of dead air before the gateway came back — the user saw 'draining (up to 75s)...' and stared at it. Change: after the drain exits with code 75, call 'reset-failed' + 'start' explicitly. Manual start bypasses RestartSec entirely (RestartSec only governs systemd's own auto-restart logic). Takes about as long as the gateway needs to come up (~1-3s on a warm box) instead of ~60s. The RestartSec=60 default stays — it's the right crash-loop guard for actual crashes. This only short-circuits the voluntary-restart path. Matches the pattern already used in 'hermes gateway restart' (systemd_restart() in hermes_cli/gateway.py, PR #20949). Tests: - tests/hermes_cli/test_update_gateway_restart.py: new test_update_bypasses_restartsec_after_graceful_drain asserts both 'reset-failed hermes-gateway' AND 'start hermes-gateway' (NOT 'restart') are issued after a successful graceful drain. - All existing tests in the affected classes still pass (TestCmdUpdateLaunchdRestart, TestCmdUpdateResetFailedBeforeRestart are green; one pre-existing flake in the latter is unrelated). --- hermes_cli/main.py | 58 ++++++++++++--- .../hermes_cli/test_update_gateway_restart.py | 71 +++++++++++++++++++ 2 files changed, 121 insertions(+), 8 deletions(-) diff --git a/hermes_cli/main.py b/hermes_cli/main.py index b523c4491b..0ec9cfd38f 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -7756,14 +7756,56 @@ def _cmd_update_impl(args, gateway_mode: bool): ) if _graceful_ok: - # Gateway exited 75; systemd should relaunch - # via Restart=on-failure. The unit's - # RestartSec (default 30s on ours) gates the - # respawn — poll past that + slack so we - # don't give up mid-cooldown and falsely - # print "drained but didn't relaunch". For - # units without RestartSec set we fall back - # to the original 10s budget. + # Gateway exited 75. ``Restart=always`` + + # ``RestartForceExitStatus=75`` means systemd + # WILL respawn the unit — but only after + # ``RestartSec`` (default 60s on our unit + # file). That 60s wait is a crash-loop guard, + # and is the right default when the gateway + # dies unexpectedly. For a voluntary restart + # on update, it's dead time the user watches. + # + # Shortcut it: ``reset-failed`` + ``start`` + # skips RestartSec entirely (we're manually + # initiating the unit, not waiting for + # systemd's auto-restart logic). Takes about + # as long as the process takes to come up + # (~1-3s on a warm box). + # + # If the unit is already active because + # RestartSec elapsed while we were draining, + # ``start`` is a no-op and we fall through to + # the poll below. Either way we collapse the + # 60s+ delay to a ~5s one. + subprocess.run( + scope_cmd + ["reset-failed", svc_name], + capture_output=True, + text=True, + timeout=10, + ) + subprocess.run( + scope_cmd + ["start", svc_name], + capture_output=True, + text=True, + timeout=15, + ) + # Short poll: the gateway should be up within + # a few seconds now that we bypassed + # RestartSec. Fall back to the longer + # RestartSec + slack budget ONLY if the + # explicit start failed and we need to rely + # on systemd's auto-restart. + if _wait_for_service_active( + scope_cmd, + svc_name, + timeout=10.0, + ): + restarted_services.append(svc_name) + continue + # Explicit start didn't take. Fall back to + # the original passive poll (systemd's + # auto-restart WILL fire after RestartSec + # regardless). _restart_sec = _service_restart_sec( scope_cmd, svc_name, diff --git a/tests/hermes_cli/test_update_gateway_restart.py b/tests/hermes_cli/test_update_gateway_restart.py index dca69abe3f..5493acb52c 100644 --- a/tests/hermes_cli/test_update_gateway_restart.py +++ b/tests/hermes_cli/test_update_gateway_restart.py @@ -653,6 +653,77 @@ class TestCmdUpdateLaunchdRestart: "Drain path failed; expected fallback `systemctl restart`." ) + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_update_bypasses_restartsec_after_graceful_drain( + self, mock_run, _mock_which, mock_args, capsys, monkeypatch, + ): + """After a graceful SIGUSR1 drain, cmd_update must issue + ``reset-failed`` + ``start`` to bypass the unit's ``RestartSec`` + cooldown (default 60s on our unit file) rather than passively + waiting for systemd's auto-restart. Collapses the post-drain delay + from ~60s to ~5s on a voluntary restart. + """ + monkeypatch.setattr(gateway_cli, "is_macos", lambda: False) + monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True) + monkeypatch.setattr(gateway_cli, "is_termux", lambda: False) + + def side_effect(cmd, **kwargs): + joined = " ".join(str(c) for c in cmd) + if "rev-parse" in joined and "--abbrev-ref" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="") + if "rev-parse" in joined and "--verify" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + if "rev-list" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="") + if "systemctl" in joined and "list-units" in joined: + if "--user" in joined: + return subprocess.CompletedProcess( + cmd, 0, + stdout="hermes-gateway.service loaded active running\n", + stderr="", + ) + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + if "systemctl" in joined and "is-active" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="") + if "systemctl" in joined and "show" in joined and "MainPID" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + mock_run.side_effect = side_effect + + # Simulate a successful graceful drain so cmd_update reaches the + # post-drain restart bypass. + monkeypatch.setattr( + "hermes_cli.gateway._graceful_restart_via_sigusr1", + lambda pid, drain_timeout: True, + ) + + with patch.object(gateway_cli, "find_gateway_pids", return_value=[]): + cmd_update(mock_args) + + calls = [ + " ".join(str(a) for a in c.args[0]) + for c in mock_run.call_args_list + if "systemctl" in " ".join(str(a) for a in c.args[0]) + ] + + # Must have called ``reset-failed hermes-gateway`` AND ``start + # hermes-gateway`` explicitly so systemd bypasses RestartSec. + reset_calls = [c for c in calls if "reset-failed" in c and "hermes-gateway" in c] + start_calls = [ + c for c in calls + if "start" in c and "hermes-gateway" in c and "restart" not in c + ] + assert reset_calls, ( + f"Expected explicit `reset-failed hermes-gateway` after graceful drain; " + f"systemctl calls were: {calls}" + ) + assert start_calls, ( + f"Expected explicit `start hermes-gateway` after graceful drain to " + f"bypass RestartSec; systemctl calls were: {calls}" + ) + @patch("shutil.which", return_value=None) @patch("subprocess.run") def test_update_no_gateway_running_skips_restart(