From abc3662bf6076045e4d4dc1e14a74cb35d69b86e Mon Sep 17 00:00:00 2001 From: Chaz Dinkle Date: Wed, 10 Jun 2026 21:22:16 -0400 Subject: [PATCH] fix(gateway): detect launchd in /restart service-manager probe (#43475) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a launchd-managed gateway (macOS), /restart stopped the gateway but never relaunched it: the handler's service detection checks only INVOCATION_ID (systemd) and container markers, so under launchd it takes the detached path and exits 0 — which KeepAlive.SuccessfulExit=false treats as a deliberate stop. The gateway stays silently dead until a manual launchctl kickstart. Detect launchd via XPC_SERVICE_NAME, which launchd sets to the job label for processes it spawns. The probe deliberately excludes the literal "0": interactive macOS shells inherit XPC_SERVICE_NAME=0 (a truthy string), and routing an unsupervised interactive gateway to the service path would make it exit non-zero with nothing to revive it. Routing through via_service=True (rather than forcing a non-zero exit on the detached path) matters: the detached path also spawns a helper that relaunches the gateway, so exiting non-zero there would have BOTH the helper and launchd respawn it — two gateways racing for the same bot tokens. The service path spawns no helper; launchd is the single respawner. Fixes #43475. Supersedes the run.py-era probes in #19940/#33393 (the handler has since moved to gateway/slash_commands.py) and avoids the double-spawn risk in the exit-code-site approaches (#43498, #43596). --- gateway/slash_commands.py | 10 ++- .../gateway/test_restart_service_detection.py | 85 +++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 tests/gateway/test_restart_service_detection.py diff --git a/gateway/slash_commands.py b/gateway/slash_commands.py index c7420bc645e..b58fa457705 100644 --- a/gateway/slash_commands.py +++ b/gateway/slash_commands.py @@ -960,7 +960,15 @@ class GatewaySlashCommandsMixin: # us. The detached subprocess approach (setsid + bash) doesn't work # under systemd (KillMode=mixed kills the cgroup) or Docker (tini # exits when the gateway dies, taking the detached helper with it). - _under_service = bool(os.environ.get("INVOCATION_ID")) # systemd sets this + # systemd sets INVOCATION_ID; launchd sets XPC_SERVICE_NAME to the + # job label. Without the launchd check, macOS /restart takes the + # detached path and exits 0, which KeepAlive.SuccessfulExit=false + # treats as a deliberate stop — the gateway stays dead until next + # login. Interactive macOS shells inherit XPC_SERVICE_NAME=0, so + # "0" must count as not-under-launchd. + _under_service = bool(os.environ.get("INVOCATION_ID")) or os.environ.get( + "XPC_SERVICE_NAME", "0" + ) not in ("", "0") _in_container = os.path.exists("/.dockerenv") or os.path.exists("/run/.containerenv") if _under_service or _in_container: self.request_restart(detached=False, via_service=True) diff --git a/tests/gateway/test_restart_service_detection.py b/tests/gateway/test_restart_service_detection.py new file mode 100644 index 00000000000..10ea2ff28f7 --- /dev/null +++ b/tests/gateway/test_restart_service_detection.py @@ -0,0 +1,85 @@ +"""Tests for /restart service-manager detection (launchd vs interactive). + +The /restart handler routes through ``request_restart(via_service=True)`` +when a service manager supervises the gateway, so the process exits with +the service-restart code and the manager relaunches it. Under macOS +launchd the plist uses ``KeepAlive.SuccessfulExit=false`` — a clean exit 0 +is treated as a deliberate stop and the gateway stays dead (#43475) — so +launchd must be detected here in the handler, not only at the exit-code +site (which never runs unless ``via_service=True`` is already set). + +launchd sets ``XPC_SERVICE_NAME`` to the job label for processes it +spawns. Interactive macOS shells inherit ``XPC_SERVICE_NAME=0`` (a +truthy string), so the probe must treat ``"0"`` as not-under-launchd: +routing an unsupervised interactive gateway to the service path would +make it exit non-zero with nothing to revive it. +""" +from unittest.mock import MagicMock + +import pytest + +import gateway.run as gateway_run +from gateway.platforms.base import MessageEvent, MessageType +from tests.gateway.restart_test_helpers import make_restart_runner, make_restart_source + + +def _make_restart_event(update_id: int | None = 100) -> MessageEvent: + return MessageEvent( + text="/restart", + message_type=MessageType.TEXT, + source=make_restart_source(), + message_id="m1", + platform_update_id=update_id, + ) + + +def _make_runner_with_mock_restart(tmp_path, monkeypatch): + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + monkeypatch.delenv("INVOCATION_ID", raising=False) + monkeypatch.delenv("XPC_SERVICE_NAME", raising=False) + runner, _adapter = make_restart_runner() + runner.request_restart = MagicMock(return_value=True) + return runner + + +@pytest.mark.asyncio +async def test_restart_under_launchd_uses_service_path(tmp_path, monkeypatch): + """launchd job label in XPC_SERVICE_NAME routes /restart via the service path.""" + runner = _make_runner_with_mock_restart(tmp_path, monkeypatch) + monkeypatch.setenv("XPC_SERVICE_NAME", "ai.hermes.gateway") + + await runner._handle_restart_command(_make_restart_event()) + + runner.request_restart.assert_called_once_with(detached=False, via_service=True) + + +@pytest.mark.asyncio +async def test_restart_in_interactive_macos_shell_uses_detached_path(tmp_path, monkeypatch): + """XPC_SERVICE_NAME=0 (inherited by interactive macOS shells) is NOT a service.""" + runner = _make_runner_with_mock_restart(tmp_path, monkeypatch) + monkeypatch.setenv("XPC_SERVICE_NAME", "0") + + await runner._handle_restart_command(_make_restart_event()) + + runner.request_restart.assert_called_once_with(detached=True, via_service=False) + + +@pytest.mark.asyncio +async def test_restart_without_service_env_uses_detached_path(tmp_path, monkeypatch): + """No service-manager env at all falls back to the detached restart.""" + runner = _make_runner_with_mock_restart(tmp_path, monkeypatch) + + await runner._handle_restart_command(_make_restart_event()) + + runner.request_restart.assert_called_once_with(detached=True, via_service=False) + + +@pytest.mark.asyncio +async def test_restart_under_systemd_uses_service_path(tmp_path, monkeypatch): + """INVOCATION_ID (systemd) still routes via the service path.""" + runner = _make_runner_with_mock_restart(tmp_path, monkeypatch) + monkeypatch.setenv("INVOCATION_ID", "abc123") + + await runner._handle_restart_command(_make_restart_event()) + + runner.request_restart.assert_called_once_with(detached=False, via_service=True)