mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
fix(gateway): detect launchd in /restart service-manager probe (#43475)
On a launchd-managed gateway (macOS), /restart stopped the gateway but never relaunched it: the handler's service detection checks only INVOCATION_ID (systemd) and container markers, so under launchd it takes the detached path and exits 0 — which KeepAlive.SuccessfulExit=false treats as a deliberate stop. The gateway stays silently dead until a manual launchctl kickstart. Detect launchd via XPC_SERVICE_NAME, which launchd sets to the job label for processes it spawns. The probe deliberately excludes the literal "0": interactive macOS shells inherit XPC_SERVICE_NAME=0 (a truthy string), and routing an unsupervised interactive gateway to the service path would make it exit non-zero with nothing to revive it. Routing through via_service=True (rather than forcing a non-zero exit on the detached path) matters: the detached path also spawns a helper that relaunches the gateway, so exiting non-zero there would have BOTH the helper and launchd respawn it — two gateways racing for the same bot tokens. The service path spawns no helper; launchd is the single respawner. Fixes #43475. Supersedes the run.py-era probes in #19940/#33393 (the handler has since moved to gateway/slash_commands.py) and avoids the double-spawn risk in the exit-code-site approaches (#43498, #43596).
This commit is contained in:
parent
73a20a6ad6
commit
abc3662bf6
2 changed files with 94 additions and 1 deletions
|
|
@ -960,7 +960,15 @@ class GatewaySlashCommandsMixin:
|
|||
# us. The detached subprocess approach (setsid + bash) doesn't work
|
||||
# under systemd (KillMode=mixed kills the cgroup) or Docker (tini
|
||||
# exits when the gateway dies, taking the detached helper with it).
|
||||
_under_service = bool(os.environ.get("INVOCATION_ID")) # systemd sets this
|
||||
# systemd sets INVOCATION_ID; launchd sets XPC_SERVICE_NAME to the
|
||||
# job label. Without the launchd check, macOS /restart takes the
|
||||
# detached path and exits 0, which KeepAlive.SuccessfulExit=false
|
||||
# treats as a deliberate stop — the gateway stays dead until next
|
||||
# login. Interactive macOS shells inherit XPC_SERVICE_NAME=0, so
|
||||
# "0" must count as not-under-launchd.
|
||||
_under_service = bool(os.environ.get("INVOCATION_ID")) or os.environ.get(
|
||||
"XPC_SERVICE_NAME", "0"
|
||||
) not in ("", "0")
|
||||
_in_container = os.path.exists("/.dockerenv") or os.path.exists("/run/.containerenv")
|
||||
if _under_service or _in_container:
|
||||
self.request_restart(detached=False, via_service=True)
|
||||
|
|
|
|||
85
tests/gateway/test_restart_service_detection.py
Normal file
85
tests/gateway/test_restart_service_detection.py
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
"""Tests for /restart service-manager detection (launchd vs interactive).
|
||||
|
||||
The /restart handler routes through ``request_restart(via_service=True)``
|
||||
when a service manager supervises the gateway, so the process exits with
|
||||
the service-restart code and the manager relaunches it. Under macOS
|
||||
launchd the plist uses ``KeepAlive.SuccessfulExit=false`` — a clean exit 0
|
||||
is treated as a deliberate stop and the gateway stays dead (#43475) — so
|
||||
launchd must be detected here in the handler, not only at the exit-code
|
||||
site (which never runs unless ``via_service=True`` is already set).
|
||||
|
||||
launchd sets ``XPC_SERVICE_NAME`` to the job label for processes it
|
||||
spawns. Interactive macOS shells inherit ``XPC_SERVICE_NAME=0`` (a
|
||||
truthy string), so the probe must treat ``"0"`` as not-under-launchd:
|
||||
routing an unsupervised interactive gateway to the service path would
|
||||
make it exit non-zero with nothing to revive it.
|
||||
"""
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
import gateway.run as gateway_run
|
||||
from gateway.platforms.base import MessageEvent, MessageType
|
||||
from tests.gateway.restart_test_helpers import make_restart_runner, make_restart_source
|
||||
|
||||
|
||||
def _make_restart_event(update_id: int | None = 100) -> MessageEvent:
    """Build the inbound ``/restart`` text message handed to the handler.

    ``update_id`` is forwarded unchanged as the event's
    ``platform_update_id``; every other field is a fixed test value.
    """
    event_fields = {
        "text": "/restart",
        "message_type": MessageType.TEXT,
        "source": make_restart_source(),
        "message_id": "m1",
        "platform_update_id": update_id,
    }
    return MessageEvent(**event_fields)
|
||||
|
||||
|
||||
def _make_runner_with_mock_restart(tmp_path, monkeypatch):
    """Return a restart-test runner whose ``request_restart`` is a mock.

    Every signal the /restart handler uses to decide that it is supervised
    is neutralised first, so each test starts from "not under a service
    manager" and opts in to exactly the one signal it wants to exercise:

    * ``INVOCATION_ID`` (systemd) and ``XPC_SERVICE_NAME`` (launchd) are
      removed from the environment.
    * The container marker files ``/.dockerenv`` and ``/run/.containerenv``
      are reported as absent. The handler also takes the service path when
      either file exists, so without this the "detached path" tests would
      fail whenever the suite itself runs inside Docker or Podman.

    All changes go through ``monkeypatch`` and are undone at test teardown.
    """
    import os  # local import: only this helper needs it

    monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
    for env_name in ("INVOCATION_ID", "XPC_SERVICE_NAME"):
        monkeypatch.delenv(env_name, raising=False)

    container_markers = ("/.dockerenv", "/run/.containerenv")
    real_exists = os.path.exists

    def _exists_outside_container(path):
        # Hide only the container markers; every other path is answered by
        # the real implementation so unrelated code keeps working.
        if str(path) in container_markers:
            return False
        return real_exists(path)

    monkeypatch.setattr(os.path, "exists", _exists_outside_container)

    runner, _adapter = make_restart_runner()
    # Replace the real restart with a mock so tests can inspect the routing
    # decision without actually stopping the process.
    runner.request_restart = MagicMock(return_value=True)
    return runner
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_restart_under_launchd_uses_service_path(tmp_path, monkeypatch):
    """A launchd job label in XPC_SERVICE_NAME selects the service restart path."""
    gateway = _make_runner_with_mock_restart(tmp_path, monkeypatch)
    monkeypatch.setenv("XPC_SERVICE_NAME", "ai.hermes.gateway")
    restart_event = _make_restart_event()

    await gateway._handle_restart_command(restart_event)

    gateway.request_restart.assert_called_once_with(detached=False, via_service=True)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_restart_in_interactive_macos_shell_uses_detached_path(tmp_path, monkeypatch):
    """The literal XPC_SERVICE_NAME=0 must not be mistaken for a launchd job."""
    gateway = _make_runner_with_mock_restart(tmp_path, monkeypatch)
    # "0" is a truthy string, so a plain truthiness check would misroute it.
    monkeypatch.setenv("XPC_SERVICE_NAME", "0")
    restart_event = _make_restart_event()

    await gateway._handle_restart_command(restart_event)

    gateway.request_restart.assert_called_once_with(detached=True, via_service=False)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_restart_without_service_env_uses_detached_path(tmp_path, monkeypatch):
    """With no service-manager variables set, /restart uses the detached path."""
    gateway = _make_runner_with_mock_restart(tmp_path, monkeypatch)
    restart_event = _make_restart_event()

    await gateway._handle_restart_command(restart_event)

    gateway.request_restart.assert_called_once_with(detached=True, via_service=False)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_restart_under_systemd_uses_service_path(tmp_path, monkeypatch):
    """Setting INVOCATION_ID (the systemd signal) selects the service restart path."""
    gateway = _make_runner_with_mock_restart(tmp_path, monkeypatch)
    monkeypatch.setenv("INVOCATION_ID", "abc123")
    restart_event = _make_restart_event()

    await gateway._handle_restart_command(restart_event)

    gateway.request_restart.assert_called_once_with(detached=False, via_service=True)
|
||||
Loading…
Add table
Add a link
Reference in a new issue