diff --git a/hermes_cli/main.py b/hermes_cli/main.py index b98d30bf8d..062cf5bf19 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -7735,6 +7735,23 @@ def _cmd_update_impl(args, gateway_mode: bool): # when the graceful path failed (unit missing # SIGUSR1 wiring, drain exceeded the budget, # restart-policy mismatch). + # + # Always `reset-failed` first. If systemd's own + # auto-restart attempts already parked the unit + # in a failed state (transient CHDIR / OOM / + # filesystem race after our drain + exit-75), + # a plain `systemctl restart` can wedge against + # the RestartSec backoff and leave the unit + # dead. Clearing the failed state first makes + # the restart idempotent. Mirrors the recovery + # path in `hermes gateway restart` + # (`systemd_restart()`) as of PR #20949. + subprocess.run( + scope_cmd + ["reset-failed", svc_name], + capture_output=True, + text=True, + timeout=10, + ) restart = subprocess.run( scope_cmd + ["restart", svc_name], capture_output=True, @@ -7754,10 +7771,19 @@ def _cmd_update_impl(args, gateway_mode: bool): else: # Retry once — transient startup failures # (stale module cache, import race) often - # resolve on the second attempt. + # resolve on the second attempt. Again + # clear any failed state first so the + # retry isn't blocked by the previous + # crash. print( f" ⚠ {svc_name} died after restart, retrying..." ) + subprocess.run( + scope_cmd + ["reset-failed", svc_name], + capture_output=True, + text=True, + timeout=10, + ) subprocess.run( scope_cmd + ["restart", svc_name], capture_output=True, @@ -7772,10 +7798,13 @@ def _cmd_update_impl(args, gateway_mode: bool): restarted_services.append(svc_name) print(f" ✓ {svc_name} recovered on retry") else: + _scope_flag = "--user " if scope == "user" else "" print( f" ✗ {svc_name} failed to stay running after restart.\n" - f" Check logs: journalctl --user -u {svc_name} --since '2 min ago'\n" - f" Restart manually: systemctl {'--user ' if scope == 'user' else ''}restart {svc_name}" + f" Check logs: journalctl {_scope_flag}-u {svc_name} --since '2 min ago'\n" + f" Recover manually:\n" + f" systemctl {_scope_flag}reset-failed {svc_name}\n" + f" systemctl {_scope_flag}restart {svc_name}" ) else: print( diff --git a/tests/hermes_cli/test_update_gateway_restart.py b/tests/hermes_cli/test_update_gateway_restart.py index aa43acd9e1..dca69abe3f 100644 --- a/tests/hermes_cli/test_update_gateway_restart.py +++ b/tests/hermes_cli/test_update_gateway_restart.py @@ -1356,3 +1356,232 @@ class TestCmdUpdateLegacyGatewayWarning: assert "Legacy Hermes gateway" in captured assert "(system scope)" in captured assert "sudo" in captured + + +# --------------------------------------------------------------------------- +# cmd_update — reset-failed precedes systemctl restart on fallback path +# --------------------------------------------------------------------------- + + +def _systemctl_calls(mock_run, subcommand): + """Return every subprocess.run call that was `systemctl [--user] `.""" + out = [] + for call in mock_run.call_args_list: + argv = call.args[0] + joined = " ".join(str(c) for c in argv) + if "systemctl" in joined and subcommand in joined: + out.append(argv) + return out + + +class TestCmdUpdateResetFailedBeforeRestart: + """`hermes update` must call `systemctl reset-failed` before every + fallback `systemctl restart` so a systemd-parked `failed` state from + earlier auto-restart crashes (CHDIR, OOM, filesystem race) doesn't + permanently strand the unit. + + Mirrors the recovery pattern `hermes gateway restart` (systemd_restart) + adopted in PR #20949. Without this, users hit "gateway never comes + back after update" until they manually run `systemctl reset-failed`. + """ + + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_reset_failed_runs_before_fallback_restart( + self, mock_run, _mock_which, mock_args, monkeypatch, + ): + """When SIGUSR1 drain times out, the fallback systemctl restart + MUST be preceded by a `reset-failed` call against the same unit.""" + monkeypatch.setattr(gateway_cli, "is_macos", lambda: False) + monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True) + monkeypatch.setattr(gateway_cli, "is_termux", lambda: False) + + mock_run.side_effect = _make_run_side_effect( + commit_count="3", + systemd_active=True, + ) + + # Force the graceful SIGUSR1 path to report failure so cmd_update + # falls back to systemctl restart. + orig = mock_run.side_effect + def wrapped(cmd, **kwargs): + joined = " ".join(str(c) for c in cmd) + if "systemctl" in joined and "show" in joined and "MainPID" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="") + return orig(cmd, **kwargs) + mock_run.side_effect = wrapped + monkeypatch.setattr( + "hermes_cli.gateway._graceful_restart_via_sigusr1", + lambda pid, drain_timeout: False, + ) + + with patch.object(gateway_cli, "find_gateway_pids", return_value=[]): + cmd_update(mock_args) + + reset_calls = _systemctl_calls(mock_run, "reset-failed") + restart_calls = _systemctl_calls(mock_run, "restart") + + assert any( + "hermes-gateway" in " ".join(str(c) for c in call) + for call in reset_calls + ), ( + "Expected `systemctl reset-failed hermes-gateway` before the " + "fallback `systemctl restart`, got reset_calls=%r" % (reset_calls,) + ) + assert restart_calls, "Fallback systemctl restart should still run" + + # Order check: the first reset-failed must come before the first restart. + first_reset_idx = None + first_restart_idx = None + for idx, call in enumerate(mock_run.call_args_list): + joined = " ".join(str(c) for c in call.args[0]) + if "systemctl" in joined and "reset-failed" in joined and first_reset_idx is None: + first_reset_idx = idx + if "systemctl" in joined and "restart" in joined and "hermes-gateway" in joined: + if first_restart_idx is None: + first_restart_idx = idx + assert first_reset_idx is not None and first_restart_idx is not None + assert first_reset_idx < first_restart_idx, ( + f"reset-failed (call #{first_reset_idx}) must precede " + f"restart (call #{first_restart_idx}) so the unit isn't " + "blocked by systemd's failed-state backoff." + ) + + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_reset_failed_also_runs_before_retry_restart( + self, mock_run, _mock_which, mock_args, monkeypatch, + ): + """If the first fallback restart spawns a process that dies + immediately (is-active stays inactive), the retry restart must + ALSO be preceded by a reset-failed — otherwise the retry races + the unit's own failed-state transition.""" + monkeypatch.setattr(gateway_cli, "is_macos", lambda: False) + monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True) + monkeypatch.setattr(gateway_cli, "is_termux", lambda: False) + + # is-active toggles: + # first call (discovery / check active) -> "active" + # later calls (post-restart verify) -> "inactive" + # Using a state counter so both the initial check and the verify + # loops behave realistically. + is_active_calls = {"n": 0} + + def side_effect(cmd, **kwargs): + joined = " ".join(str(c) for c in cmd) + if "rev-parse" in joined and "--abbrev-ref" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="") + if "rev-parse" in joined and "--verify" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + if "rev-list" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="") + if "systemctl" in joined and "list-units" in joined: + if "--user" in joined: + return subprocess.CompletedProcess( + cmd, 0, + stdout="hermes-gateway.service loaded active running\n", + stderr="", + ) + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + if "systemctl" in joined and "is-active" in joined: + is_active_calls["n"] += 1 + # First check: the unit is active (so we enter the restart path). + # Subsequent polling: inactive, which drives the retry branch. + if is_active_calls["n"] == 1: + return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="") + return subprocess.CompletedProcess(cmd, 3, stdout="inactive\n", stderr="") + if "systemctl" in joined and "show" in joined and "MainPID" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + mock_run.side_effect = side_effect + + # Force graceful SIGUSR1 to fail → fallback restart path. + monkeypatch.setattr( + "hermes_cli.gateway._graceful_restart_via_sigusr1", + lambda pid, drain_timeout: False, + ) + + with patch.object(gateway_cli, "find_gateway_pids", return_value=[]): + cmd_update(mock_args) + + reset_calls = _systemctl_calls(mock_run, "reset-failed") + restart_calls = _systemctl_calls(mock_run, "restart") + + # Two restart attempts (initial + retry), two reset-failed calls. + gateway_restarts = [ + c for c in restart_calls + if "hermes-gateway" in " ".join(str(a) for a in c) + ] + gateway_resets = [ + c for c in reset_calls + if "hermes-gateway" in " ".join(str(a) for a in c) + ] + assert len(gateway_restarts) >= 2, ( + f"Expected both initial + retry restart calls, got {len(gateway_restarts)}" + ) + assert len(gateway_resets) >= 2, ( + f"Expected reset-failed before BOTH restart attempts, " + f"got {len(gateway_resets)} reset-failed call(s)" + ) + + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_final_failure_message_tells_user_to_reset_failed( + self, mock_run, _mock_which, mock_args, capsys, monkeypatch, + ): + """When both fallback restart attempts fail, the final error + message must include `systemctl reset-failed` as part of the + manual recovery hint — not just `systemctl restart` on its own, + which is the step that just failed twice.""" + monkeypatch.setattr(gateway_cli, "is_macos", lambda: False) + monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True) + monkeypatch.setattr(gateway_cli, "is_termux", lambda: False) + + is_active_calls = {"n": 0} + + def side_effect(cmd, **kwargs): + joined = " ".join(str(c) for c in cmd) + if "rev-parse" in joined and "--abbrev-ref" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="") + if "rev-parse" in joined and "--verify" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + if "rev-list" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="") + if "systemctl" in joined and "list-units" in joined: + if "--user" in joined: + return subprocess.CompletedProcess( + cmd, 0, + stdout="hermes-gateway.service loaded active running\n", + stderr="", + ) + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + if "systemctl" in joined and "is-active" in joined: + is_active_calls["n"] += 1 + if is_active_calls["n"] == 1: + return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="") + return subprocess.CompletedProcess(cmd, 3, stdout="inactive\n", stderr="") + if "systemctl" in joined and "show" in joined and "MainPID" in joined: + return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + mock_run.side_effect = side_effect + monkeypatch.setattr( + "hermes_cli.gateway._graceful_restart_via_sigusr1", + lambda pid, drain_timeout: False, + ) + + with patch.object(gateway_cli, "find_gateway_pids", return_value=[]): + cmd_update(mock_args) + + captured = capsys.readouterr().out + assert "failed to stay running" in captured, ( + "Expected the terminal failure message to fire when both " + f"restart attempts don't survive. Got:\n{captured}" + ) + assert "reset-failed" in captured, ( + "Final recovery hint must include `reset-failed` so users " + "know how to escape systemd's parked failed state. Got:\n" + f"{captured}" + ) + assert "hermes-gateway" in captured