mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
fix(update): reset-failed before every fallback restart so the gateway can't get stranded (#21371)
cmd_update's auto-restart path could leave the gateway dead after a transient failure in systemd's own auto-restart window. Reproduced on Ubuntu 25.10 + systemd 257: after update, gateway drains and exits 75, systemd's first respawn 60s later fails (status=200/CHDIR with "No such file or directory" on a WorkingDirectory that demonstrably exists), the unit ends up in RestartMaxDelaySec=300 backoff, and cmd_update's fallback 'systemctl restart' never recovers it — leaving users with a permanently silent gateway until they manually run 'systemctl reset-failed'. The fix mirrors the recovery pattern 'hermes gateway restart' (systemd_restart) got in PR #20949: always reset-failed before restart, on both the initial fallback and the retry. Also rewrites the final failure message to tell the user to reset-failed + restart (not just restart, which is the step that already failed twice).
This commit is contained in:
parent
04918345ea
commit
1d2029b2b7
2 changed files with 261 additions and 3 deletions
|
|
@ -7735,6 +7735,23 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||||
# when the graceful path failed (unit missing
|
# when the graceful path failed (unit missing
|
||||||
# SIGUSR1 wiring, drain exceeded the budget,
|
# SIGUSR1 wiring, drain exceeded the budget,
|
||||||
# restart-policy mismatch).
|
# restart-policy mismatch).
|
||||||
|
#
|
||||||
|
# Always `reset-failed` first. If systemd's own
|
||||||
|
# auto-restart attempts already parked the unit
|
||||||
|
# in a failed state (transient CHDIR / OOM /
|
||||||
|
# filesystem race after our drain + exit-75),
|
||||||
|
# a plain `systemctl restart` can wedge against
|
||||||
|
# the RestartSec backoff and leave the unit
|
||||||
|
# dead. Clearing the failed state first makes
|
||||||
|
# the restart idempotent. Mirrors the recovery
|
||||||
|
# path in `hermes gateway restart`
|
||||||
|
# (`systemd_restart()`) as of PR #20949.
|
||||||
|
subprocess.run(
|
||||||
|
scope_cmd + ["reset-failed", svc_name],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
restart = subprocess.run(
|
restart = subprocess.run(
|
||||||
scope_cmd + ["restart", svc_name],
|
scope_cmd + ["restart", svc_name],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
|
|
@ -7754,10 +7771,19 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||||
else:
|
else:
|
||||||
# Retry once — transient startup failures
|
# Retry once — transient startup failures
|
||||||
# (stale module cache, import race) often
|
# (stale module cache, import race) often
|
||||||
# resolve on the second attempt.
|
# resolve on the second attempt. Again
|
||||||
|
# clear any failed state first so the
|
||||||
|
# retry isn't blocked by the previous
|
||||||
|
# crash.
|
||||||
print(
|
print(
|
||||||
f" ⚠ {svc_name} died after restart, retrying..."
|
f" ⚠ {svc_name} died after restart, retrying..."
|
||||||
)
|
)
|
||||||
|
subprocess.run(
|
||||||
|
scope_cmd + ["reset-failed", svc_name],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
scope_cmd + ["restart", svc_name],
|
scope_cmd + ["restart", svc_name],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
|
|
@ -7772,10 +7798,13 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||||
restarted_services.append(svc_name)
|
restarted_services.append(svc_name)
|
||||||
print(f" ✓ {svc_name} recovered on retry")
|
print(f" ✓ {svc_name} recovered on retry")
|
||||||
else:
|
else:
|
||||||
|
_scope_flag = "--user " if scope == "user" else ""
|
||||||
print(
|
print(
|
||||||
f" ✗ {svc_name} failed to stay running after restart.\n"
|
f" ✗ {svc_name} failed to stay running after restart.\n"
|
||||||
f" Check logs: journalctl --user -u {svc_name} --since '2 min ago'\n"
|
f" Check logs: journalctl {_scope_flag}-u {svc_name} --since '2 min ago'\n"
|
||||||
f" Restart manually: systemctl {'--user ' if scope == 'user' else ''}restart {svc_name}"
|
f" Recover manually:\n"
|
||||||
|
f" systemctl {_scope_flag}reset-failed {svc_name}\n"
|
||||||
|
f" systemctl {_scope_flag}restart {svc_name}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
|
|
|
||||||
|
|
@ -1356,3 +1356,232 @@ class TestCmdUpdateLegacyGatewayWarning:
|
||||||
assert "Legacy Hermes gateway" in captured
|
assert "Legacy Hermes gateway" in captured
|
||||||
assert "(system scope)" in captured
|
assert "(system scope)" in captured
|
||||||
assert "sudo" in captured
|
assert "sudo" in captured
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# cmd_update — reset-failed precedes systemctl restart on fallback path
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _systemctl_calls(mock_run, subcommand):
|
||||||
|
"""Return every subprocess.run call that was `systemctl [--user] <subcommand>`."""
|
||||||
|
out = []
|
||||||
|
for call in mock_run.call_args_list:
|
||||||
|
argv = call.args[0]
|
||||||
|
joined = " ".join(str(c) for c in argv)
|
||||||
|
if "systemctl" in joined and subcommand in joined:
|
||||||
|
out.append(argv)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
class TestCmdUpdateResetFailedBeforeRestart:
|
||||||
|
"""`hermes update` must call `systemctl reset-failed` before every
|
||||||
|
fallback `systemctl restart` so a systemd-parked `failed` state from
|
||||||
|
earlier auto-restart crashes (CHDIR, OOM, filesystem race) doesn't
|
||||||
|
permanently strand the unit.
|
||||||
|
|
||||||
|
Mirrors the recovery pattern `hermes gateway restart` (systemd_restart)
|
||||||
|
adopted in PR #20949. Without this, users hit "gateway never comes
|
||||||
|
back after update" until they manually run `systemctl reset-failed`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@patch("shutil.which", return_value=None)
|
||||||
|
@patch("subprocess.run")
|
||||||
|
def test_reset_failed_runs_before_fallback_restart(
|
||||||
|
self, mock_run, _mock_which, mock_args, monkeypatch,
|
||||||
|
):
|
||||||
|
"""When SIGUSR1 drain times out, the fallback systemctl restart
|
||||||
|
MUST be preceded by a `reset-failed` call against the same unit."""
|
||||||
|
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||||
|
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
|
||||||
|
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
|
||||||
|
|
||||||
|
mock_run.side_effect = _make_run_side_effect(
|
||||||
|
commit_count="3",
|
||||||
|
systemd_active=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Force the graceful SIGUSR1 path to report failure so cmd_update
|
||||||
|
# falls back to systemctl restart.
|
||||||
|
orig = mock_run.side_effect
|
||||||
|
def wrapped(cmd, **kwargs):
|
||||||
|
joined = " ".join(str(c) for c in cmd)
|
||||||
|
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
|
||||||
|
return orig(cmd, **kwargs)
|
||||||
|
mock_run.side_effect = wrapped
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"hermes_cli.gateway._graceful_restart_via_sigusr1",
|
||||||
|
lambda pid, drain_timeout: False,
|
||||||
|
)
|
||||||
|
|
||||||
|
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
|
||||||
|
cmd_update(mock_args)
|
||||||
|
|
||||||
|
reset_calls = _systemctl_calls(mock_run, "reset-failed")
|
||||||
|
restart_calls = _systemctl_calls(mock_run, "restart")
|
||||||
|
|
||||||
|
assert any(
|
||||||
|
"hermes-gateway" in " ".join(str(c) for c in call)
|
||||||
|
for call in reset_calls
|
||||||
|
), (
|
||||||
|
"Expected `systemctl reset-failed hermes-gateway` before the "
|
||||||
|
"fallback `systemctl restart`, got reset_calls=%r" % (reset_calls,)
|
||||||
|
)
|
||||||
|
assert restart_calls, "Fallback systemctl restart should still run"
|
||||||
|
|
||||||
|
# Order check: the first reset-failed must come before the first restart.
|
||||||
|
first_reset_idx = None
|
||||||
|
first_restart_idx = None
|
||||||
|
for idx, call in enumerate(mock_run.call_args_list):
|
||||||
|
joined = " ".join(str(c) for c in call.args[0])
|
||||||
|
if "systemctl" in joined and "reset-failed" in joined and first_reset_idx is None:
|
||||||
|
first_reset_idx = idx
|
||||||
|
if "systemctl" in joined and "restart" in joined and "hermes-gateway" in joined:
|
||||||
|
if first_restart_idx is None:
|
||||||
|
first_restart_idx = idx
|
||||||
|
assert first_reset_idx is not None and first_restart_idx is not None
|
||||||
|
assert first_reset_idx < first_restart_idx, (
|
||||||
|
f"reset-failed (call #{first_reset_idx}) must precede "
|
||||||
|
f"restart (call #{first_restart_idx}) so the unit isn't "
|
||||||
|
"blocked by systemd's failed-state backoff."
|
||||||
|
)
|
||||||
|
|
||||||
|
@patch("shutil.which", return_value=None)
|
||||||
|
@patch("subprocess.run")
|
||||||
|
def test_reset_failed_also_runs_before_retry_restart(
|
||||||
|
self, mock_run, _mock_which, mock_args, monkeypatch,
|
||||||
|
):
|
||||||
|
"""If the first fallback restart spawns a process that dies
|
||||||
|
immediately (is-active stays inactive), the retry restart must
|
||||||
|
ALSO be preceded by a reset-failed — otherwise the retry races
|
||||||
|
the unit's own failed-state transition."""
|
||||||
|
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||||
|
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
|
||||||
|
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
|
||||||
|
|
||||||
|
# is-active toggles:
|
||||||
|
# first call (discovery / check active) -> "active"
|
||||||
|
# later calls (post-restart verify) -> "inactive"
|
||||||
|
# Using a state counter so both the initial check and the verify
|
||||||
|
# loops behave realistically.
|
||||||
|
is_active_calls = {"n": 0}
|
||||||
|
|
||||||
|
def side_effect(cmd, **kwargs):
|
||||||
|
joined = " ".join(str(c) for c in cmd)
|
||||||
|
if "rev-parse" in joined and "--abbrev-ref" in joined:
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="")
|
||||||
|
if "rev-parse" in joined and "--verify" in joined:
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||||
|
if "rev-list" in joined:
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="")
|
||||||
|
if "systemctl" in joined and "list-units" in joined:
|
||||||
|
if "--user" in joined:
|
||||||
|
return subprocess.CompletedProcess(
|
||||||
|
cmd, 0,
|
||||||
|
stdout="hermes-gateway.service loaded active running\n",
|
||||||
|
stderr="",
|
||||||
|
)
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||||
|
if "systemctl" in joined and "is-active" in joined:
|
||||||
|
is_active_calls["n"] += 1
|
||||||
|
# First check: the unit is active (so we enter the restart path).
|
||||||
|
# Subsequent polling: inactive, which drives the retry branch.
|
||||||
|
if is_active_calls["n"] == 1:
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
|
||||||
|
return subprocess.CompletedProcess(cmd, 3, stdout="inactive\n", stderr="")
|
||||||
|
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||||
|
|
||||||
|
mock_run.side_effect = side_effect
|
||||||
|
|
||||||
|
# Force graceful SIGUSR1 to fail → fallback restart path.
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"hermes_cli.gateway._graceful_restart_via_sigusr1",
|
||||||
|
lambda pid, drain_timeout: False,
|
||||||
|
)
|
||||||
|
|
||||||
|
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
|
||||||
|
cmd_update(mock_args)
|
||||||
|
|
||||||
|
reset_calls = _systemctl_calls(mock_run, "reset-failed")
|
||||||
|
restart_calls = _systemctl_calls(mock_run, "restart")
|
||||||
|
|
||||||
|
# Two restart attempts (initial + retry), two reset-failed calls.
|
||||||
|
gateway_restarts = [
|
||||||
|
c for c in restart_calls
|
||||||
|
if "hermes-gateway" in " ".join(str(a) for a in c)
|
||||||
|
]
|
||||||
|
gateway_resets = [
|
||||||
|
c for c in reset_calls
|
||||||
|
if "hermes-gateway" in " ".join(str(a) for a in c)
|
||||||
|
]
|
||||||
|
assert len(gateway_restarts) >= 2, (
|
||||||
|
f"Expected both initial + retry restart calls, got {len(gateway_restarts)}"
|
||||||
|
)
|
||||||
|
assert len(gateway_resets) >= 2, (
|
||||||
|
f"Expected reset-failed before BOTH restart attempts, "
|
||||||
|
f"got {len(gateway_resets)} reset-failed call(s)"
|
||||||
|
)
|
||||||
|
|
||||||
|
@patch("shutil.which", return_value=None)
|
||||||
|
@patch("subprocess.run")
|
||||||
|
def test_final_failure_message_tells_user_to_reset_failed(
|
||||||
|
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
|
||||||
|
):
|
||||||
|
"""When both fallback restart attempts fail, the final error
|
||||||
|
message must include `systemctl reset-failed` as part of the
|
||||||
|
manual recovery hint — not just `systemctl restart` on its own,
|
||||||
|
which is the step that just failed twice."""
|
||||||
|
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||||
|
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
|
||||||
|
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
|
||||||
|
|
||||||
|
is_active_calls = {"n": 0}
|
||||||
|
|
||||||
|
def side_effect(cmd, **kwargs):
|
||||||
|
joined = " ".join(str(c) for c in cmd)
|
||||||
|
if "rev-parse" in joined and "--abbrev-ref" in joined:
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="")
|
||||||
|
if "rev-parse" in joined and "--verify" in joined:
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||||
|
if "rev-list" in joined:
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="")
|
||||||
|
if "systemctl" in joined and "list-units" in joined:
|
||||||
|
if "--user" in joined:
|
||||||
|
return subprocess.CompletedProcess(
|
||||||
|
cmd, 0,
|
||||||
|
stdout="hermes-gateway.service loaded active running\n",
|
||||||
|
stderr="",
|
||||||
|
)
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||||
|
if "systemctl" in joined and "is-active" in joined:
|
||||||
|
is_active_calls["n"] += 1
|
||||||
|
if is_active_calls["n"] == 1:
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
|
||||||
|
return subprocess.CompletedProcess(cmd, 3, stdout="inactive\n", stderr="")
|
||||||
|
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
|
||||||
|
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||||
|
|
||||||
|
mock_run.side_effect = side_effect
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"hermes_cli.gateway._graceful_restart_via_sigusr1",
|
||||||
|
lambda pid, drain_timeout: False,
|
||||||
|
)
|
||||||
|
|
||||||
|
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
|
||||||
|
cmd_update(mock_args)
|
||||||
|
|
||||||
|
captured = capsys.readouterr().out
|
||||||
|
assert "failed to stay running" in captured, (
|
||||||
|
"Expected the terminal failure message to fire when both "
|
||||||
|
f"restart attempts don't survive. Got:\n{captured}"
|
||||||
|
)
|
||||||
|
assert "reset-failed" in captured, (
|
||||||
|
"Final recovery hint must include `reset-failed` so users "
|
||||||
|
"know how to escape systemd's parked failed state. Got:\n"
|
||||||
|
f"{captured}"
|
||||||
|
)
|
||||||
|
assert "hermes-gateway" in captured
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue