From 96691268dffa40df7110bcab6bdf63ada260a06d Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 30 Apr 2026 19:57:42 -0700 Subject: [PATCH] fix(gateway): drain manual profile gateways via SIGUSR1 before respawn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PR wired in a detached watcher that respawns manual profile gateways after they exit. Pair that with a SIGUSR1 graceful drain (same path systemd/launchd use) so in-flight agent runs finish instead of getting SIGTERM'd. Fall back to SIGTERM if SIGUSR1 isn't wired or the gateway doesn't exit within the drain budget — the watcher sees the exit and relaunches either way. Tested end-to-end against an orphaned gateway: graceful drain exits in 0.5s and the watcher fires the relaunch command. --- hermes_cli/main.py | 16 +++++-- .../hermes_cli/test_update_gateway_restart.py | 42 ++++++++++++++++++- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 318d893742..79ef21eec7 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -7438,13 +7438,23 @@ def _cmd_update_impl(args, gateway_mode: bool): if proc.pid in manual_pids } for pid, proc in profile_processes.items(): - if launch_detached_profile_gateway_restart(proc.profile, pid): + if not launch_detached_profile_gateway_restart(proc.profile, pid): + continue + # Prefer a graceful SIGUSR1 drain so in-flight agent runs + # finish before the watcher respawns the gateway. If the + # gateway doesn't support SIGUSR1 or doesn't exit within + # the drain budget, fall back to SIGTERM — the watcher + # still sees the exit and relaunches either way. + drained = _graceful_restart_via_sigusr1( + pid, drain_timeout=_drain_budget, + ) + if not drained: try: os.kill(pid, _signal.SIGTERM) - killed_pids.add(pid) - relaunched_profiles.append(proc.profile) except (ProcessLookupError, PermissionError): pass + killed_pids.add(pid) + relaunched_profiles.append(proc.profile) for pid in manual_pids: if pid in profile_processes: diff --git a/tests/hermes_cli/test_update_gateway_restart.py b/tests/hermes_cli/test_update_gateway_restart.py index e8af435ea5..721149ddef 100644 --- a/tests/hermes_cli/test_update_gateway_restart.py +++ b/tests/hermes_cli/test_update_gateway_restart.py @@ -418,15 +418,55 @@ class TestCmdUpdateLaunchdRestart: with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \ patch.object(gateway_cli, "find_profile_gateway_processes", return_value=[process]), \ patch.object(gateway_cli, "launch_detached_profile_gateway_restart", return_value=True) as restart, \ + patch.object(gateway_cli, "_graceful_restart_via_sigusr1", return_value=True) as graceful, \ patch("os.kill") as kill: cmd_update(mock_args) captured = capsys.readouterr().out restart.assert_called_once_with("coder", 12345) - kill.assert_called_once() + graceful.assert_called_once() + # Graceful drain succeeded — no SIGTERM fallback needed. + kill.assert_not_called() assert "Restarting manual gateway profile(s): coder" in captured assert "Restart manually: hermes gateway run" not in captured + @patch("shutil.which", return_value=None) + @patch("subprocess.run") + def test_update_profile_manual_gateway_falls_back_to_sigterm( + self, mock_run, _mock_which, mock_args, capsys, tmp_path, monkeypatch, + ): + """When graceful SIGUSR1 drain fails, manual profile restart falls back to SIGTERM.""" + monkeypatch.setattr(gateway_cli, "is_macos", lambda: True) + monkeypatch.setattr( + gateway_cli, + "get_launchd_plist_path", + lambda: tmp_path / "ai.hermes.gateway.plist", + ) + + mock_run.side_effect = _make_run_side_effect( + commit_count="3", + launchctl_loaded=False, + ) + process = gateway_cli.ProfileGatewayProcess( + profile="coder", + path=tmp_path / ".hermes" / "profiles" / "coder", + pid=12345, + ) + + with patch.object(gateway_cli, "find_gateway_pids", return_value=[12345]), \ + patch.object(gateway_cli, "find_profile_gateway_processes", return_value=[process]), \ + patch.object(gateway_cli, "launch_detached_profile_gateway_restart", return_value=True) as restart, \ + patch.object(gateway_cli, "_graceful_restart_via_sigusr1", return_value=False) as graceful, \ + patch("os.kill") as kill: + cmd_update(mock_args) + + captured = capsys.readouterr().out + restart.assert_called_once_with("coder", 12345) + graceful.assert_called_once() + # Graceful drain returned False → SIGTERM fallback. + kill.assert_called_once() + assert "Restarting manual gateway profile(s): coder" in captured + @patch("shutil.which", return_value=None) @patch("subprocess.run") def test_update_with_systemd_still_restarts_via_systemd(