diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index fe7bb9bd8..4b13bc70f 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -1128,7 +1128,62 @@ def systemd_restart(system: bool = False): pid = get_running_pid() if pid is not None and _request_gateway_self_restart(pid): - print(f"✓ {_service_scope_label(system).capitalize()} service restart requested") + # SIGUSR1 sent — the gateway will drain active agents, exit with + # code 75, and systemd will restart it after RestartSec (30s). + # Wait for the old process to die and the new one to become active + # so the CLI doesn't return while the service is still restarting. + import time + scope_label = _service_scope_label(system).capitalize() + svc = get_service_name() + scope_cmd = _systemctl_cmd(system) + + # Phase 1: wait for old process to exit (drain + shutdown) + print(f"⏳ {scope_label} service draining active work...") + deadline = time.time() + 90 + while time.time() < deadline: + try: + os.kill(pid, 0) + time.sleep(1) + except (ProcessLookupError, PermissionError): + break # old process is gone + else: + print(f"⚠ Old process (PID {pid}) still alive after 90s") + + # Phase 2: wait for systemd to start the new process + print(f"⏳ Waiting for {svc} to restart...") + deadline = time.time() + 60 + while time.time() < deadline: + try: + result = subprocess.run( + scope_cmd + ["is-active", svc], + capture_output=True, text=True, timeout=5, + ) + if result.stdout.strip() == "active": + # Verify it's a NEW process, not the old one somehow + new_pid = get_running_pid() + if new_pid and new_pid != pid: + print(f"✓ {scope_label} service restarted (PID {new_pid})") + return + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + time.sleep(2) + + # Timed out — check final state + try: + result = subprocess.run( + scope_cmd + ["is-active", svc], + capture_output=True, text=True, timeout=5, + ) + if result.stdout.strip() == "active": + print(f"✓ {scope_label} service restarted") + return + except Exception: + pass + print( + f"⚠ {scope_label} service did not become active within 60s.\n" + f" Check status: {'sudo ' if system else ''}hermes gateway status\n" + f" Check logs: journalctl {'--user ' if not system else ''}-u {svc} --since '2 min ago'" + ) return _run_systemctl(["reload-or-restart", get_service_name()], system=system, check=True, timeout=90) print(f"✓ {_service_scope_label(system).capitalize()} service restarted") diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index ec35aa997..fedbdf4d1 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -452,7 +452,7 @@ class TestGatewayServiceDetection: class TestGatewaySystemServiceRouting: - def test_systemd_restart_self_requests_graceful_restart_without_reload_or_restart(self, monkeypatch, capsys): + def test_systemd_restart_self_requests_graceful_restart_and_waits(self, monkeypatch, capsys): calls = [] monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False) @@ -466,16 +466,37 @@ class TestGatewaySystemServiceRouting: "_request_gateway_self_restart", lambda pid: calls.append(("self", pid)) or True, ) - monkeypatch.setattr( - gateway_cli.subprocess, - "run", - lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("systemctl should not run")), - ) + + # Simulate: old process dies immediately, new process becomes active + kill_call_count = [0] + def fake_kill(pid, sig): + kill_call_count[0] += 1 + if kill_call_count[0] >= 2: # first call checks, second = dead + raise ProcessLookupError() + monkeypatch.setattr(os, "kill", fake_kill) + + # Simulate systemctl is-active returning "active" with a new PID + new_pid = [None] + def fake_subprocess_run(cmd, **kwargs): + if "is-active" in cmd: + result = SimpleNamespace(stdout="active\n", returncode=0) + new_pid[0] = 999 # new PID + return result + raise AssertionError(f"Unexpected systemctl call: {cmd}") + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_subprocess_run) + # get_running_pid returns new PID after restart + pid_calls = [0] + def fake_get_pid(): + pid_calls[0] += 1 + return 999 if pid_calls[0] > 1 else 654 + monkeypatch.setattr("gateway.status.get_running_pid", fake_get_pid) gateway_cli.systemd_restart() - assert calls == [("refresh", False), ("self", 654)] - assert "restart requested" in capsys.readouterr().out.lower() + assert ("self", 654) in calls + out = capsys.readouterr().out.lower() + assert "restarted" in out def test_gateway_install_passes_system_flags(self, monkeypatch): monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)