mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
fix(update): reset-failed before every fallback restart so the gateway can't get stranded (#21371)
cmd_update's auto-restart path could leave the gateway dead after a transient failure in systemd's own auto-restart window. Reproduced on Ubuntu 25.10 + systemd 257: after update, gateway drains and exits 75, systemd's first respawn 60s later fails (status=200/CHDIR with "No such file or directory" on a WorkingDirectory that demonstrably exists), the unit ends up in RestartMaxDelaySec=300 backoff, and cmd_update's fallback 'systemctl restart' never recovers it — leaving users with a permanently silent gateway until they manually run 'systemctl reset-failed'. The fix mirrors the recovery pattern 'hermes gateway restart' (systemd_restart) got in PR #20949: always reset-failed before restart, on both the initial fallback and the retry. Also rewrites the final failure message to tell the user to reset-failed + restart (not just restart, which is the step that already failed twice).
This commit is contained in:
parent
04918345ea
commit
1d2029b2b7
2 changed files with 261 additions and 3 deletions
|
|
@ -7735,6 +7735,23 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
|||
# when the graceful path failed (unit missing
|
||||
# SIGUSR1 wiring, drain exceeded the budget,
|
||||
# restart-policy mismatch).
|
||||
#
|
||||
# Always `reset-failed` first. If systemd's own
|
||||
# auto-restart attempts already parked the unit
|
||||
# in a failed state (transient CHDIR / OOM /
|
||||
# filesystem race after our drain + exit-75),
|
||||
# a plain `systemctl restart` can wedge against
|
||||
# the RestartSec backoff and leave the unit
|
||||
# dead. Clearing the failed state first makes
|
||||
# the restart idempotent. Mirrors the recovery
|
||||
# path in `hermes gateway restart`
|
||||
# (`systemd_restart()`) as of PR #20949.
|
||||
subprocess.run(
|
||||
scope_cmd + ["reset-failed", svc_name],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
restart = subprocess.run(
|
||||
scope_cmd + ["restart", svc_name],
|
||||
capture_output=True,
|
||||
|
|
@ -7754,10 +7771,19 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
|||
else:
|
||||
# Retry once — transient startup failures
|
||||
# (stale module cache, import race) often
|
||||
# resolve on the second attempt.
|
||||
# resolve on the second attempt. Again
|
||||
# clear any failed state first so the
|
||||
# retry isn't blocked by the previous
|
||||
# crash.
|
||||
print(
|
||||
f" ⚠ {svc_name} died after restart, retrying..."
|
||||
)
|
||||
subprocess.run(
|
||||
scope_cmd + ["reset-failed", svc_name],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
subprocess.run(
|
||||
scope_cmd + ["restart", svc_name],
|
||||
capture_output=True,
|
||||
|
|
@ -7772,10 +7798,13 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
|||
restarted_services.append(svc_name)
|
||||
print(f" ✓ {svc_name} recovered on retry")
|
||||
else:
|
||||
_scope_flag = "--user " if scope == "user" else ""
|
||||
print(
|
||||
f" ✗ {svc_name} failed to stay running after restart.\n"
|
||||
f" Check logs: journalctl --user -u {svc_name} --since '2 min ago'\n"
|
||||
f" Restart manually: systemctl {'--user ' if scope == 'user' else ''}restart {svc_name}"
|
||||
f" Check logs: journalctl {_scope_flag}-u {svc_name} --since '2 min ago'\n"
|
||||
f" Recover manually:\n"
|
||||
f" systemctl {_scope_flag}reset-failed {svc_name}\n"
|
||||
f" systemctl {_scope_flag}restart {svc_name}"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
|
|
|
|||
|
|
@ -1356,3 +1356,232 @@ class TestCmdUpdateLegacyGatewayWarning:
|
|||
assert "Legacy Hermes gateway" in captured
|
||||
assert "(system scope)" in captured
|
||||
assert "sudo" in captured
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# cmd_update — reset-failed precedes systemctl restart on fallback path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _systemctl_calls(mock_run, subcommand):
|
||||
"""Return every subprocess.run call that was `systemctl [--user] <subcommand>`."""
|
||||
out = []
|
||||
for call in mock_run.call_args_list:
|
||||
argv = call.args[0]
|
||||
joined = " ".join(str(c) for c in argv)
|
||||
if "systemctl" in joined and subcommand in joined:
|
||||
out.append(argv)
|
||||
return out
|
||||
|
||||
|
||||
class TestCmdUpdateResetFailedBeforeRestart:
|
||||
"""`hermes update` must call `systemctl reset-failed` before every
|
||||
fallback `systemctl restart` so a systemd-parked `failed` state from
|
||||
earlier auto-restart crashes (CHDIR, OOM, filesystem race) doesn't
|
||||
permanently strand the unit.
|
||||
|
||||
Mirrors the recovery pattern `hermes gateway restart` (systemd_restart)
|
||||
adopted in PR #20949. Without this, users hit "gateway never comes
|
||||
back after update" until they manually run `systemctl reset-failed`.
|
||||
"""
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_reset_failed_runs_before_fallback_restart(
|
||||
self, mock_run, _mock_which, mock_args, monkeypatch,
|
||||
):
|
||||
"""When SIGUSR1 drain times out, the fallback systemctl restart
|
||||
MUST be preceded by a `reset-failed` call against the same unit."""
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
|
||||
|
||||
mock_run.side_effect = _make_run_side_effect(
|
||||
commit_count="3",
|
||||
systemd_active=True,
|
||||
)
|
||||
|
||||
# Force the graceful SIGUSR1 path to report failure so cmd_update
|
||||
# falls back to systemctl restart.
|
||||
orig = mock_run.side_effect
|
||||
def wrapped(cmd, **kwargs):
|
||||
joined = " ".join(str(c) for c in cmd)
|
||||
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
|
||||
return orig(cmd, **kwargs)
|
||||
mock_run.side_effect = wrapped
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.gateway._graceful_restart_via_sigusr1",
|
||||
lambda pid, drain_timeout: False,
|
||||
)
|
||||
|
||||
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
|
||||
cmd_update(mock_args)
|
||||
|
||||
reset_calls = _systemctl_calls(mock_run, "reset-failed")
|
||||
restart_calls = _systemctl_calls(mock_run, "restart")
|
||||
|
||||
assert any(
|
||||
"hermes-gateway" in " ".join(str(c) for c in call)
|
||||
for call in reset_calls
|
||||
), (
|
||||
"Expected `systemctl reset-failed hermes-gateway` before the "
|
||||
"fallback `systemctl restart`, got reset_calls=%r" % (reset_calls,)
|
||||
)
|
||||
assert restart_calls, "Fallback systemctl restart should still run"
|
||||
|
||||
# Order check: the first reset-failed must come before the first restart.
|
||||
first_reset_idx = None
|
||||
first_restart_idx = None
|
||||
for idx, call in enumerate(mock_run.call_args_list):
|
||||
joined = " ".join(str(c) for c in call.args[0])
|
||||
if "systemctl" in joined and "reset-failed" in joined and first_reset_idx is None:
|
||||
first_reset_idx = idx
|
||||
if "systemctl" in joined and "restart" in joined and "hermes-gateway" in joined:
|
||||
if first_restart_idx is None:
|
||||
first_restart_idx = idx
|
||||
assert first_reset_idx is not None and first_restart_idx is not None
|
||||
assert first_reset_idx < first_restart_idx, (
|
||||
f"reset-failed (call #{first_reset_idx}) must precede "
|
||||
f"restart (call #{first_restart_idx}) so the unit isn't "
|
||||
"blocked by systemd's failed-state backoff."
|
||||
)
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_reset_failed_also_runs_before_retry_restart(
|
||||
self, mock_run, _mock_which, mock_args, monkeypatch,
|
||||
):
|
||||
"""If the first fallback restart spawns a process that dies
|
||||
immediately (is-active stays inactive), the retry restart must
|
||||
ALSO be preceded by a reset-failed — otherwise the retry races
|
||||
the unit's own failed-state transition."""
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
|
||||
|
||||
# is-active toggles:
|
||||
# first call (discovery / check active) -> "active"
|
||||
# later calls (post-restart verify) -> "inactive"
|
||||
# Using a state counter so both the initial check and the verify
|
||||
# loops behave realistically.
|
||||
is_active_calls = {"n": 0}
|
||||
|
||||
def side_effect(cmd, **kwargs):
|
||||
joined = " ".join(str(c) for c in cmd)
|
||||
if "rev-parse" in joined and "--abbrev-ref" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="")
|
||||
if "rev-parse" in joined and "--verify" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
if "rev-list" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="")
|
||||
if "systemctl" in joined and "list-units" in joined:
|
||||
if "--user" in joined:
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout="hermes-gateway.service loaded active running\n",
|
||||
stderr="",
|
||||
)
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
if "systemctl" in joined and "is-active" in joined:
|
||||
is_active_calls["n"] += 1
|
||||
# First check: the unit is active (so we enter the restart path).
|
||||
# Subsequent polling: inactive, which drives the retry branch.
|
||||
if is_active_calls["n"] == 1:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 3, stdout="inactive\n", stderr="")
|
||||
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
mock_run.side_effect = side_effect
|
||||
|
||||
# Force graceful SIGUSR1 to fail → fallback restart path.
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.gateway._graceful_restart_via_sigusr1",
|
||||
lambda pid, drain_timeout: False,
|
||||
)
|
||||
|
||||
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
|
||||
cmd_update(mock_args)
|
||||
|
||||
reset_calls = _systemctl_calls(mock_run, "reset-failed")
|
||||
restart_calls = _systemctl_calls(mock_run, "restart")
|
||||
|
||||
# Two restart attempts (initial + retry), two reset-failed calls.
|
||||
gateway_restarts = [
|
||||
c for c in restart_calls
|
||||
if "hermes-gateway" in " ".join(str(a) for a in c)
|
||||
]
|
||||
gateway_resets = [
|
||||
c for c in reset_calls
|
||||
if "hermes-gateway" in " ".join(str(a) for a in c)
|
||||
]
|
||||
assert len(gateway_restarts) >= 2, (
|
||||
f"Expected both initial + retry restart calls, got {len(gateway_restarts)}"
|
||||
)
|
||||
assert len(gateway_resets) >= 2, (
|
||||
f"Expected reset-failed before BOTH restart attempts, "
|
||||
f"got {len(gateway_resets)} reset-failed call(s)"
|
||||
)
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_final_failure_message_tells_user_to_reset_failed(
|
||||
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
|
||||
):
|
||||
"""When both fallback restart attempts fail, the final error
|
||||
message must include `systemctl reset-failed` as part of the
|
||||
manual recovery hint — not just `systemctl restart` on its own,
|
||||
which is the step that just failed twice."""
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
|
||||
|
||||
is_active_calls = {"n": 0}
|
||||
|
||||
def side_effect(cmd, **kwargs):
|
||||
joined = " ".join(str(c) for c in cmd)
|
||||
if "rev-parse" in joined and "--abbrev-ref" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="")
|
||||
if "rev-parse" in joined and "--verify" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
if "rev-list" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="")
|
||||
if "systemctl" in joined and "list-units" in joined:
|
||||
if "--user" in joined:
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout="hermes-gateway.service loaded active running\n",
|
||||
stderr="",
|
||||
)
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
if "systemctl" in joined and "is-active" in joined:
|
||||
is_active_calls["n"] += 1
|
||||
if is_active_calls["n"] == 1:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 3, stdout="inactive\n", stderr="")
|
||||
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
mock_run.side_effect = side_effect
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.gateway._graceful_restart_via_sigusr1",
|
||||
lambda pid, drain_timeout: False,
|
||||
)
|
||||
|
||||
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
|
||||
cmd_update(mock_args)
|
||||
|
||||
captured = capsys.readouterr().out
|
||||
assert "failed to stay running" in captured, (
|
||||
"Expected the terminal failure message to fire when both "
|
||||
f"restart attempts don't survive. Got:\n{captured}"
|
||||
)
|
||||
assert "reset-failed" in captured, (
|
||||
"Final recovery hint must include `reset-failed` so users "
|
||||
"know how to escape systemd's parked failed state. Got:\n"
|
||||
f"{captured}"
|
||||
)
|
||||
assert "hermes-gateway" in captured
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue