fix(gateway): detect launchd in /restart service-manager probe (#43475)

On a launchd-managed gateway (macOS), /restart stopped the gateway but
never relaunched it: the handler's service detection checks only
INVOCATION_ID (systemd) and container markers, so under launchd it takes
the detached path and exits 0 — which KeepAlive.SuccessfulExit=false
treats as a deliberate stop. The gateway stays silently dead until a
manual launchctl kickstart.

Detect launchd via XPC_SERVICE_NAME, which launchd sets to the job label
for processes it spawns. The probe deliberately excludes the literal
"0": interactive macOS shells inherit XPC_SERVICE_NAME=0 (a truthy
string), and routing an unsupervised interactive gateway to the service
path would make it exit non-zero with nothing to revive it.

Routing through via_service=True (rather than forcing a non-zero exit
on the detached path) matters: the detached path also spawns a helper
that relaunches the gateway, so exiting non-zero there would have BOTH
the helper and launchd respawn it — two gateways racing for the same
bot tokens. The service path spawns no helper; launchd is the single
respawner.

Fixes #43475. Supersedes the run.py-era probes in #19940/#33393 (the
handler has since moved to gateway/slash_commands.py) and avoids the
double-spawn risk in the exit-code-site approaches (#43498, #43596).
This commit is contained in:
Chaz Dinkle 2026-06-10 21:22:16 -04:00 committed by Teknium
parent 73a20a6ad6
commit abc3662bf6
2 changed files with 94 additions and 1 deletions

View file

@ -960,7 +960,15 @@ class GatewaySlashCommandsMixin:
# us. The detached subprocess approach (setsid + bash) doesn't work
# under systemd (KillMode=mixed kills the cgroup) or Docker (tini
# exits when the gateway dies, taking the detached helper with it).
_under_service = bool(os.environ.get("INVOCATION_ID")) # systemd sets this
# systemd sets INVOCATION_ID; launchd sets XPC_SERVICE_NAME to the
# job label. Without the launchd check, macOS /restart takes the
# detached path and exits 0, which KeepAlive.SuccessfulExit=false
# treats as a deliberate stop — the gateway stays dead until next
# login. Interactive macOS shells inherit XPC_SERVICE_NAME=0, so
# "0" must count as not-under-launchd.
_under_service = bool(os.environ.get("INVOCATION_ID")) or os.environ.get(
"XPC_SERVICE_NAME", "0"
) not in ("", "0")
_in_container = os.path.exists("/.dockerenv") or os.path.exists("/run/.containerenv")
if _under_service or _in_container:
self.request_restart(detached=False, via_service=True)

View file

@ -0,0 +1,85 @@
"""Tests for /restart service-manager detection (launchd vs interactive).
The /restart handler routes through ``request_restart(via_service=True)``
when a service manager supervises the gateway, so the process exits with
the service-restart code and the manager relaunches it. Under macOS
launchd the plist uses ``KeepAlive.SuccessfulExit=false`` — a clean exit 0
is treated as a deliberate stop and the gateway stays dead (#43475) — so
launchd must be detected here in the handler, not only at the exit-code
site (which never runs unless ``via_service=True`` is already set).
launchd sets ``XPC_SERVICE_NAME`` to the job label for processes it
spawns. Interactive macOS shells inherit ``XPC_SERVICE_NAME=0`` (a
truthy string), so the probe must treat ``"0"`` as not-under-launchd:
routing an unsupervised interactive gateway to the service path would
make it exit non-zero with nothing to revive it.
"""
from unittest.mock import MagicMock
import pytest
import gateway.run as gateway_run
from gateway.platforms.base import MessageEvent, MessageType
from tests.gateway.restart_test_helpers import make_restart_runner, make_restart_source
def _make_restart_event(update_id: int | None = 100) -> MessageEvent:
    """Build the text ``/restart`` message event fed to the handler.

    Args:
        update_id: Value forwarded as the event's ``platform_update_id``;
            may be ``None``.

    Returns:
        A ``MessageEvent`` of type ``TEXT`` whose text is ``/restart`` and
        whose source comes from ``make_restart_source()``.
    """
    event_fields = {
        "text": "/restart",
        "message_type": MessageType.TEXT,
        "source": make_restart_source(),
        "message_id": "m1",
        "platform_update_id": update_id,
    }
    return MessageEvent(**event_fields)
def _make_runner_with_mock_restart(tmp_path, monkeypatch):
    """Return a restart runner isolated from the host's supervision state.

    The /restart handler picks the service path when it sees a systemd or
    launchd environment variable *or* a container marker file. This helper
    neutralises all three probes so each test starts from "unsupervised"
    and opts in to exactly the signal it wants to exercise.

    Args:
        tmp_path: Pytest temporary directory, installed as the gateway's
            ``_hermes_home``.
        monkeypatch: Pytest ``monkeypatch`` fixture; every change made here
            is undone automatically at test teardown.

    Returns:
        The runner from ``make_restart_runner()`` with ``request_restart``
        replaced by a ``MagicMock`` that returns ``True``.
    """
    import os  # local import: only this helper needs it

    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)

    # Drop whatever service-manager markers the developer/CI shell exports.
    for marker in ("INVOCATION_ID", "XPC_SERVICE_NAME"):
        monkeypatch.delenv(marker, raising=False)

    # The handler also checks for /.dockerenv and /run/.containerenv. Without
    # masking them, the detached-path tests fail whenever the suite itself
    # runs inside Docker/Podman. Every other path is delegated unchanged.
    container_markers = frozenset({"/.dockerenv", "/run/.containerenv"})
    real_exists = os.path.exists

    def _exists_outside_container(path):
        if isinstance(path, str) and path in container_markers:
            return False
        return real_exists(path)

    monkeypatch.setattr(os.path, "exists", _exists_outside_container)

    runner, _adapter = make_restart_runner()
    runner.request_restart = MagicMock(return_value=True)
    return runner
@pytest.mark.asyncio
async def test_restart_under_launchd_uses_service_path(tmp_path, monkeypatch):
    """A launchd job label in XPC_SERVICE_NAME sends /restart down the service path."""
    gateway = _make_runner_with_mock_restart(tmp_path, monkeypatch)
    # Any value other than "" / "0" stands for a launchd job label.
    monkeypatch.setenv("XPC_SERVICE_NAME", "ai.hermes.gateway")

    restart_event = _make_restart_event()
    await gateway._handle_restart_command(restart_event)

    expected_kwargs = {"detached": False, "via_service": True}
    gateway.request_restart.assert_called_once_with(**expected_kwargs)
@pytest.mark.asyncio
async def test_restart_in_interactive_macos_shell_uses_detached_path(tmp_path, monkeypatch):
    """XPC_SERVICE_NAME=0, as inherited by interactive macOS shells, is NOT a service."""
    gateway = _make_runner_with_mock_restart(tmp_path, monkeypatch)
    # "0" is a truthy string, so a plain truthiness check would misroute it.
    monkeypatch.setenv("XPC_SERVICE_NAME", "0")

    restart_event = _make_restart_event()
    await gateway._handle_restart_command(restart_event)

    expected_kwargs = {"detached": True, "via_service": False}
    gateway.request_restart.assert_called_once_with(**expected_kwargs)
@pytest.mark.asyncio
async def test_restart_without_service_env_uses_detached_path(tmp_path, monkeypatch):
    """With no service-manager variables set, /restart falls back to the detached path."""
    # The helper already removes INVOCATION_ID and XPC_SERVICE_NAME.
    gateway = _make_runner_with_mock_restart(tmp_path, monkeypatch)

    restart_event = _make_restart_event()
    await gateway._handle_restart_command(restart_event)

    expected_kwargs = {"detached": True, "via_service": False}
    gateway.request_restart.assert_called_once_with(**expected_kwargs)
@pytest.mark.asyncio
async def test_restart_under_systemd_uses_service_path(tmp_path, monkeypatch):
    """INVOCATION_ID (set by systemd) keeps routing /restart via the service path."""
    gateway = _make_runner_with_mock_restart(tmp_path, monkeypatch)
    # Regression guard: adding the launchd probe must not break systemd detection.
    monkeypatch.setenv("INVOCATION_ID", "abc123")

    restart_event = _make_restart_event()
    await gateway._handle_restart_command(restart_event)

    expected_kwargs = {"detached": False, "via_service": True}
    gateway.request_restart.assert_called_once_with(**expected_kwargs)