fix(update): reset-failed before every fallback restart so the gateway can't get stranded (#21371)

cmd_update's auto-restart path could leave the gateway dead after a
transient failure in systemd's own auto-restart window.  Reproduced
on Ubuntu 25.10 + systemd 257: after update, gateway drains and exits 75,
systemd's first respawn 60s later fails (status=200/CHDIR with
"No such file or directory" on a WorkingDirectory that demonstrably
exists), the unit ends up in RestartMaxDelaySec=300 backoff, and
cmd_update's fallback 'systemctl restart' never recovers it — leaving
users with a permanently silent gateway until they manually run
'systemctl reset-failed'.

The fix mirrors the recovery pattern 'hermes gateway restart'
(systemd_restart) got in PR #20949: always reset-failed before
restart, on both the initial fallback and the retry.  Also rewrites
the final failure message to tell the user to reset-failed +
restart (not just restart, which is the step that already failed
twice).
This commit is contained in:
Teknium 2026-05-07 08:34:12 -07:00 committed by GitHub
parent 04918345ea
commit 1d2029b2b7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 261 additions and 3 deletions

View file

@ -7735,6 +7735,23 @@ def _cmd_update_impl(args, gateway_mode: bool):
# when the graceful path failed (unit missing # when the graceful path failed (unit missing
# SIGUSR1 wiring, drain exceeded the budget, # SIGUSR1 wiring, drain exceeded the budget,
# restart-policy mismatch). # restart-policy mismatch).
#
# Always `reset-failed` first. If systemd's own
# auto-restart attempts already parked the unit
# in a failed state (transient CHDIR / OOM /
# filesystem race after our drain + exit-75),
# a plain `systemctl restart` can wedge against
# the RestartSec backoff and leave the unit
# dead. Clearing the failed state first makes
# the restart idempotent. Mirrors the recovery
# path in `hermes gateway restart`
# (`systemd_restart()`) as of PR #20949.
subprocess.run(
scope_cmd + ["reset-failed", svc_name],
capture_output=True,
text=True,
timeout=10,
)
restart = subprocess.run( restart = subprocess.run(
scope_cmd + ["restart", svc_name], scope_cmd + ["restart", svc_name],
capture_output=True, capture_output=True,
@ -7754,10 +7771,19 @@ def _cmd_update_impl(args, gateway_mode: bool):
else: else:
# Retry once — transient startup failures # Retry once — transient startup failures
# (stale module cache, import race) often # (stale module cache, import race) often
# resolve on the second attempt. # resolve on the second attempt. Again
# clear any failed state first so the
# retry isn't blocked by the previous
# crash.
print( print(
f"{svc_name} died after restart, retrying..." f"{svc_name} died after restart, retrying..."
) )
subprocess.run(
scope_cmd + ["reset-failed", svc_name],
capture_output=True,
text=True,
timeout=10,
)
subprocess.run( subprocess.run(
scope_cmd + ["restart", svc_name], scope_cmd + ["restart", svc_name],
capture_output=True, capture_output=True,
@ -7772,10 +7798,13 @@ def _cmd_update_impl(args, gateway_mode: bool):
restarted_services.append(svc_name) restarted_services.append(svc_name)
print(f"{svc_name} recovered on retry") print(f"{svc_name} recovered on retry")
else: else:
_scope_flag = "--user " if scope == "user" else ""
print( print(
f"{svc_name} failed to stay running after restart.\n" f"{svc_name} failed to stay running after restart.\n"
f" Check logs: journalctl --user -u {svc_name} --since '2 min ago'\n" f" Check logs: journalctl {_scope_flag}-u {svc_name} --since '2 min ago'\n"
f" Restart manually: systemctl {'--user ' if scope == 'user' else ''}restart {svc_name}" f" Recover manually:\n"
f" systemctl {_scope_flag}reset-failed {svc_name}\n"
f" systemctl {_scope_flag}restart {svc_name}"
) )
else: else:
print( print(

View file

@ -1356,3 +1356,232 @@ class TestCmdUpdateLegacyGatewayWarning:
assert "Legacy Hermes gateway" in captured assert "Legacy Hermes gateway" in captured
assert "(system scope)" in captured assert "(system scope)" in captured
assert "sudo" in captured assert "sudo" in captured
# ---------------------------------------------------------------------------
# cmd_update — reset-failed precedes systemctl restart on fallback path
# ---------------------------------------------------------------------------
def _systemctl_calls(mock_run, subcommand):
"""Return every subprocess.run call that was `systemctl [--user] <subcommand>`."""
out = []
for call in mock_run.call_args_list:
argv = call.args[0]
joined = " ".join(str(c) for c in argv)
if "systemctl" in joined and subcommand in joined:
out.append(argv)
return out
class TestCmdUpdateResetFailedBeforeRestart:
"""`hermes update` must call `systemctl reset-failed` before every
fallback `systemctl restart` so a systemd-parked `failed` state from
earlier auto-restart crashes (CHDIR, OOM, filesystem race) doesn't
permanently strand the unit.
Mirrors the recovery pattern `hermes gateway restart` (systemd_restart)
adopted in PR #20949. Without this, users hit "gateway never comes
back after update" until they manually run `systemctl reset-failed`.
"""
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_reset_failed_runs_before_fallback_restart(
self, mock_run, _mock_which, mock_args, monkeypatch,
):
"""When SIGUSR1 drain times out, the fallback systemctl restart
MUST be preceded by a `reset-failed` call against the same unit."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
mock_run.side_effect = _make_run_side_effect(
commit_count="3",
systemd_active=True,
)
# Force the graceful SIGUSR1 path to report failure so cmd_update
# falls back to systemctl restart.
orig = mock_run.side_effect
def wrapped(cmd, **kwargs):
joined = " ".join(str(c) for c in cmd)
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
return orig(cmd, **kwargs)
mock_run.side_effect = wrapped
monkeypatch.setattr(
"hermes_cli.gateway._graceful_restart_via_sigusr1",
lambda pid, drain_timeout: False,
)
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(mock_args)
reset_calls = _systemctl_calls(mock_run, "reset-failed")
restart_calls = _systemctl_calls(mock_run, "restart")
assert any(
"hermes-gateway" in " ".join(str(c) for c in call)
for call in reset_calls
), (
"Expected `systemctl reset-failed hermes-gateway` before the "
"fallback `systemctl restart`, got reset_calls=%r" % (reset_calls,)
)
assert restart_calls, "Fallback systemctl restart should still run"
# Order check: the first reset-failed must come before the first restart.
first_reset_idx = None
first_restart_idx = None
for idx, call in enumerate(mock_run.call_args_list):
joined = " ".join(str(c) for c in call.args[0])
if "systemctl" in joined and "reset-failed" in joined and first_reset_idx is None:
first_reset_idx = idx
if "systemctl" in joined and "restart" in joined and "hermes-gateway" in joined:
if first_restart_idx is None:
first_restart_idx = idx
assert first_reset_idx is not None and first_restart_idx is not None
assert first_reset_idx < first_restart_idx, (
f"reset-failed (call #{first_reset_idx}) must precede "
f"restart (call #{first_restart_idx}) so the unit isn't "
"blocked by systemd's failed-state backoff."
)
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_reset_failed_also_runs_before_retry_restart(
self, mock_run, _mock_which, mock_args, monkeypatch,
):
"""If the first fallback restart spawns a process that dies
immediately (is-active stays inactive), the retry restart must
ALSO be preceded by a reset-failed otherwise the retry races
the unit's own failed-state transition."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
# is-active toggles:
# first call (discovery / check active) -> "active"
# later calls (post-restart verify) -> "inactive"
# Using a state counter so both the initial check and the verify
# loops behave realistically.
is_active_calls = {"n": 0}
def side_effect(cmd, **kwargs):
joined = " ".join(str(c) for c in cmd)
if "rev-parse" in joined and "--abbrev-ref" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="")
if "rev-parse" in joined and "--verify" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
if "rev-list" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="")
if "systemctl" in joined and "list-units" in joined:
if "--user" in joined:
return subprocess.CompletedProcess(
cmd, 0,
stdout="hermes-gateway.service loaded active running\n",
stderr="",
)
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
if "systemctl" in joined and "is-active" in joined:
is_active_calls["n"] += 1
# First check: the unit is active (so we enter the restart path).
# Subsequent polling: inactive, which drives the retry branch.
if is_active_calls["n"] == 1:
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
return subprocess.CompletedProcess(cmd, 3, stdout="inactive\n", stderr="")
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
mock_run.side_effect = side_effect
# Force graceful SIGUSR1 to fail → fallback restart path.
monkeypatch.setattr(
"hermes_cli.gateway._graceful_restart_via_sigusr1",
lambda pid, drain_timeout: False,
)
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(mock_args)
reset_calls = _systemctl_calls(mock_run, "reset-failed")
restart_calls = _systemctl_calls(mock_run, "restart")
# Two restart attempts (initial + retry), two reset-failed calls.
gateway_restarts = [
c for c in restart_calls
if "hermes-gateway" in " ".join(str(a) for a in c)
]
gateway_resets = [
c for c in reset_calls
if "hermes-gateway" in " ".join(str(a) for a in c)
]
assert len(gateway_restarts) >= 2, (
f"Expected both initial + retry restart calls, got {len(gateway_restarts)}"
)
assert len(gateway_resets) >= 2, (
f"Expected reset-failed before BOTH restart attempts, "
f"got {len(gateway_resets)} reset-failed call(s)"
)
@patch("shutil.which", return_value=None)
@patch("subprocess.run")
def test_final_failure_message_tells_user_to_reset_failed(
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
):
"""When both fallback restart attempts fail, the final error
message must include `systemctl reset-failed` as part of the
manual recovery hint not just `systemctl restart` on its own,
which is the step that just failed twice."""
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
is_active_calls = {"n": 0}
def side_effect(cmd, **kwargs):
joined = " ".join(str(c) for c in cmd)
if "rev-parse" in joined and "--abbrev-ref" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="")
if "rev-parse" in joined and "--verify" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
if "rev-list" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="")
if "systemctl" in joined and "list-units" in joined:
if "--user" in joined:
return subprocess.CompletedProcess(
cmd, 0,
stdout="hermes-gateway.service loaded active running\n",
stderr="",
)
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
if "systemctl" in joined and "is-active" in joined:
is_active_calls["n"] += 1
if is_active_calls["n"] == 1:
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
return subprocess.CompletedProcess(cmd, 3, stdout="inactive\n", stderr="")
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
mock_run.side_effect = side_effect
monkeypatch.setattr(
"hermes_cli.gateway._graceful_restart_via_sigusr1",
lambda pid, drain_timeout: False,
)
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
cmd_update(mock_args)
captured = capsys.readouterr().out
assert "failed to stay running" in captured, (
"Expected the terminal failure message to fire when both "
f"restart attempts don't survive. Got:\n{captured}"
)
assert "reset-failed" in captured, (
"Final recovery hint must include `reset-failed` so users "
"know how to escape systemd's parked failed state. Got:\n"
f"{captured}"
)
assert "hermes-gateway" in captured