mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-09 03:11:58 +00:00
fix(update): bypass systemd RestartSec after graceful drain (#22101)
After a clean SIGUSR1 drain, cmd_update passively polled for systemd's auto-restart to fire. Our unit file sets RestartSec=60 (a crash-loop guard), so the voluntary-restart path waited a full minute of dead air before the gateway came back — the user saw 'draining (up to 75s)...' and stared at it. Change: after the drain exits with code 75, call 'reset-failed' + 'start' explicitly. Manual start bypasses RestartSec entirely (RestartSec only governs systemd's own auto-restart logic). Takes about as long as the gateway needs to come up (~1-3s on a warm box) instead of ~60s. The RestartSec=60 default stays — it's the right crash-loop guard for actual crashes. This only short-circuits the voluntary-restart path. Matches the pattern already used in 'hermes gateway restart' (systemd_restart() in hermes_cli/gateway.py, PR #20949). Tests: - tests/hermes_cli/test_update_gateway_restart.py: new test_update_bypasses_restartsec_after_graceful_drain asserts both 'reset-failed hermes-gateway' AND 'start hermes-gateway' (NOT 'restart') are issued after a successful graceful drain. - All existing tests in the affected classes still pass (TestCmdUpdateLaunchdRestart, TestCmdUpdateResetFailedBeforeRestart are green; one pre-existing flake in the latter is unrelated).
This commit is contained in:
parent
5089596685
commit
d971b26bfd
2 changed files with 121 additions and 8 deletions
|
|
@ -7756,14 +7756,56 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
|||
)
|
||||
|
||||
if _graceful_ok:
|
||||
# Gateway exited 75; systemd should relaunch
|
||||
# via Restart=on-failure. The unit's
|
||||
# RestartSec (default 30s on ours) gates the
|
||||
# respawn — poll past that + slack so we
|
||||
# don't give up mid-cooldown and falsely
|
||||
# print "drained but didn't relaunch". For
|
||||
# units without RestartSec set we fall back
|
||||
# to the original 10s budget.
|
||||
# Gateway exited 75. ``Restart=always`` +
|
||||
# ``RestartForceExitStatus=75`` means systemd
|
||||
# WILL respawn the unit — but only after
|
||||
# ``RestartSec`` (default 60s on our unit
|
||||
# file). That 60s wait is a crash-loop guard,
|
||||
# and is the right default when the gateway
|
||||
# dies unexpectedly. For a voluntary restart
|
||||
# on update, it's dead time the user watches.
|
||||
#
|
||||
# Shortcut it: ``reset-failed`` + ``start``
|
||||
# skips RestartSec entirely (we're manually
|
||||
# initiating the unit, not waiting for
|
||||
# systemd's auto-restart logic). Takes about
|
||||
# as long as the process takes to come up
|
||||
# (~1-3s on a warm box).
|
||||
#
|
||||
# If the unit is already active because
|
||||
# RestartSec elapsed while we were draining,
|
||||
# ``start`` is a no-op and we fall through to
|
||||
# the poll below. Either way we collapse the
|
||||
# 60s+ delay to a ~5s one.
|
||||
subprocess.run(
|
||||
scope_cmd + ["reset-failed", svc_name],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
subprocess.run(
|
||||
scope_cmd + ["start", svc_name],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=15,
|
||||
)
|
||||
# Short poll: the gateway should be up within
|
||||
# a few seconds now that we bypassed
|
||||
# RestartSec. Fall back to the longer
|
||||
# RestartSec + slack budget ONLY if the
|
||||
# explicit start failed and we need to rely
|
||||
# on systemd's auto-restart.
|
||||
if _wait_for_service_active(
|
||||
scope_cmd,
|
||||
svc_name,
|
||||
timeout=10.0,
|
||||
):
|
||||
restarted_services.append(svc_name)
|
||||
continue
|
||||
# Explicit start didn't take. Fall back to
|
||||
# the original passive poll (systemd's
|
||||
# auto-restart WILL fire after RestartSec
|
||||
# regardless).
|
||||
_restart_sec = _service_restart_sec(
|
||||
scope_cmd,
|
||||
svc_name,
|
||||
|
|
|
|||
|
|
@ -653,6 +653,77 @@ class TestCmdUpdateLaunchdRestart:
|
|||
"Drain path failed; expected fallback `systemctl restart`."
|
||||
)
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_update_bypasses_restartsec_after_graceful_drain(
|
||||
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
|
||||
):
|
||||
"""After a graceful SIGUSR1 drain, cmd_update must issue
|
||||
``reset-failed`` + ``start`` to bypass the unit's ``RestartSec``
|
||||
cooldown (default 60s on our unit file) rather than passively
|
||||
waiting for systemd's auto-restart. Collapses the post-drain delay
|
||||
from ~60s to ~5s on a voluntary restart.
|
||||
"""
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
|
||||
|
||||
def side_effect(cmd, **kwargs):
|
||||
joined = " ".join(str(c) for c in cmd)
|
||||
if "rev-parse" in joined and "--abbrev-ref" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="")
|
||||
if "rev-parse" in joined and "--verify" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
if "rev-list" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="")
|
||||
if "systemctl" in joined and "list-units" in joined:
|
||||
if "--user" in joined:
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout="hermes-gateway.service loaded active running\n",
|
||||
stderr="",
|
||||
)
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
if "systemctl" in joined and "is-active" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
|
||||
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
mock_run.side_effect = side_effect
|
||||
|
||||
# Simulate a successful graceful drain so cmd_update reaches the
|
||||
# post-drain restart bypass.
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.gateway._graceful_restart_via_sigusr1",
|
||||
lambda pid, drain_timeout: True,
|
||||
)
|
||||
|
||||
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
|
||||
cmd_update(mock_args)
|
||||
|
||||
calls = [
|
||||
" ".join(str(a) for a in c.args[0])
|
||||
for c in mock_run.call_args_list
|
||||
if "systemctl" in " ".join(str(a) for a in c.args[0])
|
||||
]
|
||||
|
||||
# Must have called ``reset-failed hermes-gateway`` AND ``start
|
||||
# hermes-gateway`` explicitly so systemd bypasses RestartSec.
|
||||
reset_calls = [c for c in calls if "reset-failed" in c and "hermes-gateway" in c]
|
||||
start_calls = [
|
||||
c for c in calls
|
||||
if "start" in c and "hermes-gateway" in c and "restart" not in c
|
||||
]
|
||||
assert reset_calls, (
|
||||
f"Expected explicit `reset-failed hermes-gateway` after graceful drain; "
|
||||
f"systemctl calls were: {calls}"
|
||||
)
|
||||
assert start_calls, (
|
||||
f"Expected explicit `start hermes-gateway` after graceful drain to "
|
||||
f"bypass RestartSec; systemctl calls were: {calls}"
|
||||
)
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_update_no_gateway_running_skips_restart(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue