fix: hermes gateway restart waits for service to come back up (#8260)

Previously, systemd_restart() sent SIGUSR1 to the gateway, printed 'restart requested', and returned immediately. The gateway still needed to drain active agents, exit with code 75, wait for systemd's RestartSec=30, and start the new process. The user saw 'success' but the gateway was actually down for 30-60 seconds. Now the SIGUSR1 path blocks with progress feedback: Phase 1 — wait for old process to die: ⏳ User service draining active work... Polls os.kill(pid, 0) until ProcessLookupError (up to 90s) Phase 2 — wait for new process to become active: ⏳ Waiting for hermes-gateway to restart... Polls systemctl is-active + verifies new PID (up to 60s) Success: ✓ User service restarted (PID 12345) Timeout: ⚠ User service did not become active within 60s. Check status: hermes gateway status Check logs: journalctl --user -u hermes-gateway --since '2 min ago' The reload-or-restart fallback path (line 1189) already blocks because systemctl reload-or-restart is synchronous. Test plan: - Updated test to verify wait-for-restart behavior - All 118 gateway CLI tests pass
2026-06-13 09:01:54 +00:00 · 2026-04-14 17:12:12 -07:00 · 2026-04-14 17:12:12 -07:00 · 2a98098035
commit 2a98098035
parent 6c89306437
2 changed files with 85 additions and 9 deletions
--- a/hermes_cli/gateway.py
+++ b/hermes_cli/gateway.py
@ -1128,7 +1128,62 @@ def systemd_restart(system: bool = False):

    pid = get_running_pid()
    if pid is not None and _request_gateway_self_restart(pid):
-        print(f"✓ {_service_scope_label(system).capitalize()} service restart requested")
+        # SIGUSR1 sent — the gateway will drain active agents, exit with
+        # code 75, and systemd will restart it after RestartSec (30s).
+        # Wait for the old process to die and the new one to become active
+        # so the CLI doesn't return while the service is still restarting.
+        import time
+        scope_label = _service_scope_label(system).capitalize()
+        svc = get_service_name()
+        scope_cmd = _systemctl_cmd(system)
+
+        # Phase 1: wait for old process to exit (drain + shutdown)
+        print(f"⏳ {scope_label} service draining active work...")
+        deadline = time.time() + 90
+        while time.time() < deadline:
+            try:
+                os.kill(pid, 0)
+                time.sleep(1)
+            except (ProcessLookupError, PermissionError):
+                break  # old process is gone
+        else:
+            print(f"⚠ Old process (PID {pid}) still alive after 90s")
+
+        # Phase 2: wait for systemd to start the new process
+        print(f"⏳ Waiting for {svc} to restart...")
+        deadline = time.time() + 60
+        while time.time() < deadline:
+            try:
+                result = subprocess.run(
+                    scope_cmd + ["is-active", svc],
+                    capture_output=True, text=True, timeout=5,
+                )
+                if result.stdout.strip() == "active":
+                    # Verify it's a NEW process, not the old one somehow
+                    new_pid = get_running_pid()
+                    if new_pid and new_pid != pid:
+                        print(f"✓ {scope_label} service restarted (PID {new_pid})")
+                        return
+            except (subprocess.TimeoutExpired, FileNotFoundError):
+                pass
+            time.sleep(2)
+
+        # Timed out — check final state
+        try:
+            result = subprocess.run(
+                scope_cmd + ["is-active", svc],
+                capture_output=True, text=True, timeout=5,
+            )
+            if result.stdout.strip() == "active":
+                print(f"✓ {scope_label} service restarted")
+                return
+        except Exception:
+            pass
+        print(
+            f"⚠ {scope_label} service did not become active within 60s.\n"
+            f"  Check status: {'sudo ' if system else ''}hermes gateway status\n"
+            f"  Check logs:   journalctl {'--user ' if not system else ''}-u {svc} --since '2 min ago'"
+        )
        return
    _run_systemctl(["reload-or-restart", get_service_name()], system=system, check=True, timeout=90)
    print(f"✓ {_service_scope_label(system).capitalize()} service restarted")
--- a/tests/hermes_cli/test_gateway_service.py
+++ b/tests/hermes_cli/test_gateway_service.py
@ -452,7 +452,7 @@ class TestGatewayServiceDetection:


 class TestGatewaySystemServiceRouting:
-    def test_systemd_restart_self_requests_graceful_restart_without_reload_or_restart(self, monkeypatch, capsys):
+    def test_systemd_restart_self_requests_graceful_restart_and_waits(self, monkeypatch, capsys):
        calls = []

        monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False)
@ -466,16 +466,37 @@ class TestGatewaySystemServiceRouting:
            "_request_gateway_self_restart",
            lambda pid: calls.append(("self", pid)) or True,
        )
-        monkeypatch.setattr(
-            gateway_cli.subprocess,
-            "run",
-            lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("systemctl should not run")),
-        )
+
+        # Simulate: old process dies immediately, new process becomes active
+        kill_call_count = [0]
+        def fake_kill(pid, sig):
+            kill_call_count[0] += 1
+            if kill_call_count[0] >= 2:  # first call checks, second = dead
+                raise ProcessLookupError()
+        monkeypatch.setattr(os, "kill", fake_kill)
+
+        # Simulate systemctl is-active returning "active" with a new PID
+        new_pid = [None]
+        def fake_subprocess_run(cmd, **kwargs):
+            if "is-active" in cmd:
+                result = SimpleNamespace(stdout="active\n", returncode=0)
+                new_pid[0] = 999  # new PID
+                return result
+            raise AssertionError(f"Unexpected systemctl call: {cmd}")
+
+        monkeypatch.setattr(gateway_cli.subprocess, "run", fake_subprocess_run)
+        # get_running_pid returns new PID after restart
+        pid_calls = [0]
+        def fake_get_pid():
+            pid_calls[0] += 1
+            return 999 if pid_calls[0] > 1 else 654
+        monkeypatch.setattr("gateway.status.get_running_pid", fake_get_pid)

        gateway_cli.systemd_restart()

-        assert calls == [("refresh", False), ("self", 654)]
-        assert "restart requested" in capsys.readouterr().out.lower()
+        assert ("self", 654) in calls
+        out = capsys.readouterr().out.lower()
+        assert "restarted" in out

    def test_gateway_install_passes_system_flags(self, monkeypatch):
        monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)