mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-11 03:31:55 +00:00
fix(update): bypass systemd RestartSec after graceful drain (#22101)
After a clean SIGUSR1 drain, cmd_update passively polled for systemd's auto-restart to fire. Our unit file sets RestartSec=60 (a crash-loop guard), so the voluntary-restart path waited a full minute of dead air before the gateway came back — the user saw 'draining (up to 75s)...' and stared at it.

Change: after the drain exits with code 75, call 'reset-failed' + 'start' explicitly. Manual start bypasses RestartSec entirely (RestartSec only governs systemd's own auto-restart logic). Takes about as long as the gateway needs to come up (~1-3s on a warm box) instead of ~60s.

The RestartSec=60 default stays — it's the right crash-loop guard for actual crashes. This only short-circuits the voluntary-restart path. Matches the pattern already used in 'hermes gateway restart' (systemd_restart() in hermes_cli/gateway.py, PR #20949).

Tests:
- tests/hermes_cli/test_update_gateway_restart.py: new test_update_bypasses_restartsec_after_graceful_drain asserts both 'reset-failed hermes-gateway' AND 'start hermes-gateway' (NOT 'restart') are issued after a successful graceful drain.
- All existing tests in the affected classes still pass (TestCmdUpdateLaunchdRestart, TestCmdUpdateResetFailedBeforeRestart are green; one pre-existing flake in the latter is unrelated).
This commit is contained in:
parent
5089596685
commit
d971b26bfd
2 changed files with 121 additions and 8 deletions
|
|
@ -653,6 +653,77 @@ class TestCmdUpdateLaunchdRestart:
|
|||
"Drain path failed; expected fallback `systemctl restart`."
|
||||
)
|
||||
|
||||
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_update_bypasses_restartsec_after_graceful_drain(
    self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
):
    """Check that ``cmd_update`` restarts the unit itself after a drain.

    With the graceful SIGUSR1 drain stubbed to report success, the
    recorded ``systemctl`` invocations must include both a
    ``reset-failed`` and a plain ``start`` (not ``restart``) for
    ``hermes-gateway``. Issuing those explicitly is what lets the
    voluntary-restart path skip the unit's ``RestartSec`` cooldown
    instead of waiting for systemd's own auto-restart.
    """
    # Present the host as a systemd-capable, non-macOS, non-Termux box.
    monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
    monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
    monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)

    def fake_run(cmd, **kwargs):
        """Answer git and systemctl probes with canned successful output."""
        text = " ".join(map(str, cmd))

        def done(stdout=""):
            return subprocess.CompletedProcess(cmd, 0, stdout=stdout, stderr="")

        if "rev-parse" in text:
            if "--abbrev-ref" in text:
                return done("main\n")
            if "--verify" in text:
                return done()
        if "rev-list" in text:
            return done("3\n")
        if "systemctl" in text:
            if "list-units" in text:
                # Only the --user scope reports a running gateway unit.
                if "--user" in text:
                    return done("hermes-gateway.service loaded active running\n")
                return done()
            if "is-active" in text:
                return done("active\n")
            if "show" in text and "MainPID" in text:
                return done("4242\n")
        # Every other command succeeds with empty output.
        return done()

    mock_run.side_effect = fake_run

    # Report the graceful drain as successful so cmd_update proceeds to
    # the post-drain restart bypass.
    def drain_succeeds(pid, drain_timeout):
        return True

    monkeypatch.setattr(
        "hermes_cli.gateway._graceful_restart_via_sigusr1",
        drain_succeeds,
    )

    with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
        cmd_update(mock_args)

    # Render each recorded subprocess.run argv as one string and keep
    # only the systemctl invocations.
    systemctl_calls = []
    for recorded in mock_run.call_args_list:
        rendered = " ".join(str(part) for part in recorded.args[0])
        if "systemctl" in rendered:
            systemctl_calls.append(rendered)

    resets = [
        line for line in systemctl_calls
        if "reset-failed" in line and "hermes-gateway" in line
    ]
    # A bare ``start`` is required; ``restart`` lines are excluded.
    starts = [
        line for line in systemctl_calls
        if "hermes-gateway" in line and "start" in line and "restart" not in line
    ]

    assert resets, (
        f"Expected explicit `reset-failed hermes-gateway` after graceful drain; "
        f"systemctl calls were: {systemctl_calls}"
    )
    assert starts, (
        f"Expected explicit `start hermes-gateway` after graceful drain to "
        f"bypass RestartSec; systemctl calls were: {systemctl_calls}"
    )
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_update_no_gateway_running_skips_restart(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue