From 78b0008f4451c4b3047107926e466dcfc257ae3e Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 8 May 2026 17:00:21 -0700 Subject: [PATCH] fix(gateway): also catch restart TimeoutExpired; friendly message Extends #19994 to the restart path. Dashboard spawns 'hermes gateway restart' in the background; when a wedged adapter websocket pushes drain past the 90s CLI timeout, the dashboard previously surfaced a raw subprocess.TimeoutExpired traceback. Mirror systemd_stop()'s TimeoutExpired catch onto both forcing-restart sites in systemd_restart(). Adds a test that exercises the no-active-pid branch end-to-end. --- hermes_cli/gateway.py | 14 +++++++++ tests/hermes_cli/test_gateway_service.py | 39 ++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index adee8cd44b..9b851d99f1 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -2456,6 +2456,13 @@ def systemd_restart(system: bool = False): _print_systemd_start_limit_wait(system=system) return raise + except subprocess.TimeoutExpired: + label = _service_scope_label(system) + print( + f"Gateway {label} service is still restarting after 90s; " + "check `hermes gateway status` or logs for final state." + ) + return _wait_for_systemd_service_restart(system=system, previous_pid=pid) return @@ -2475,6 +2482,13 @@ def systemd_restart(system: bool = False): _print_systemd_start_limit_wait(system=system) return raise + except subprocess.TimeoutExpired: + label = _service_scope_label(system) + print( + f"Gateway {label} service is still restarting after 90s; " + "check `hermes gateway status` or logs for final state." + ) + return _wait_for_systemd_service_restart(system=system, previous_pid=pid) diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index 3b68476fbd..47de6013df 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -164,6 +164,45 @@ class TestSystemdServiceRefresh: assert "still stopping after 90s" in output assert "hermes gateway status" in output + def test_systemd_restart_timeout_prints_status_guidance(self, monkeypatch, capsys): + """`hermes gateway restart` must not surface a raw TimeoutExpired traceback. + + The dashboard spawns `hermes gateway restart` in the background; when a + wedged adapter websocket pushes drain past the 90s CLI timeout, the + dashboard would previously show a Python traceback (issue #19937 + follow-up: the same failure mode applies to restart, not just stop). + """ + monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False) + monkeypatch.setattr(gateway_cli, "_require_service_installed", lambda action, system=False: None) + monkeypatch.setattr(gateway_cli, "_preflight_user_systemd", lambda: None) + monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", lambda system=False: None) + monkeypatch.setattr(status, "get_running_pid", lambda cleanup_stale=True: None) + monkeypatch.setattr(gateway_cli, "_systemd_main_pid", lambda system=False: None) + monkeypatch.setattr( + gateway_cli, + "_recover_pending_systemd_restart", + lambda system=False, previous_pid=None: False, + ) + monkeypatch.setattr( + gateway_cli, + "_systemd_service_is_start_limited", + lambda system=False: False, + ) + + def fake_run_systemctl(args, **kwargs): + # reset-failed is a pre-step (check=False, 30s) — let it pass. + if args and args[0] == "reset-failed": + return SimpleNamespace(returncode=0, stdout="", stderr="") + raise subprocess.TimeoutExpired(args, kwargs.get("timeout")) + + monkeypatch.setattr(gateway_cli, "_run_systemctl", fake_run_systemctl) + + gateway_cli.systemd_restart() + + output = capsys.readouterr().out + assert "still restarting after 90s" in output + assert "hermes gateway status" in output + def test_run_gateway_refreshes_outdated_unit_on_boot(self, tmp_path, monkeypatch): """run_gateway() should refresh the systemd unit on boot so that restart settings take effect even when the process was respawned