From e48803daec3a1dccae24e2af09e1b467593f4de3 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Wed, 17 Jun 2026 05:19:21 -0700
Subject: [PATCH] fix(gateway): defer macOS launchd reload when run inside the
gateway tree (#47842)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When refresh_launchd_plist_if_needed() runs from inside the gateway's own
launchd process tree (agent-initiated self-update via the terminal tool), a
direct launchctl bootout tears down the service's process group — including
the CLI doing the refresh — before the follow-up bootstrap can run. The
gateway is left unloaded and KeepAlive can't revive it (#43842).
Detect in-service execution via gateway.status.get_running_pid() +
_is_pid_ancestor_of_current_process(), and delegate the bootout->bootstrap to
a detached (start_new_session=True) helper that survives the process-group
teardown. The normal out-of-tree CLI path is unchanged.
Fixes #43842.
---
hermes_cli/gateway.py | 51 ++++++++++-
tests/hermes_cli/test_gateway_service.py | 106 +++++++++++++++++++++++
2 files changed, 155 insertions(+), 2 deletions(-)
diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py
index d5b76574a12..7e5406a11dd 100644
--- a/hermes_cli/gateway.py
+++ b/hermes_cli/gateway.py
@@ -7,6 +7,7 @@ Handles: hermes gateway [run|start|stop|restart|status|install|uninstall|setup]
import asyncio
import logging
import os
+import shlex
import shutil
import signal
import subprocess
@@ -3467,14 +3468,60 @@ def refresh_launchd_plist_if_needed() -> bool:
plist_path.write_text(new_plist, encoding="utf-8")
label = get_launchd_label()
+ domain = _launchd_domain()
+ target = f"{domain}/{label}"
+
+ # If this refresh is running INSIDE the gateway's own launchd process tree
+ # (e.g. the agent triggered a self-update via its terminal tool), a direct
+ # `launchctl bootout` tears down the service's process group — which
+ # includes THIS CLI — before the follow-up `bootstrap` can run. The gateway
+ # then stays unloaded and KeepAlive can't revive it (#43842). Detect that
+ # case and hand the reload to a detached session that survives the bootout.
+ gateway_pid = None
+ try:
+ from gateway.status import get_running_pid
+ gateway_pid = get_running_pid()
+ except Exception:
+ gateway_pid = None
+
+ if (
+ gateway_pid is not None
+ and _is_pid_ancestor_of_current_process(gateway_pid)
+ and hasattr(os, "setsid") # POSIX-only; launchd is macOS so always true here
+ ):
+ # Delegate to a new session: `start_new_session=True` detaches the
+ # helper from the gateway's process group, so the bootout that kills
+ # the gateway (and us) does not kill the helper before it bootstraps.
+ reload_script = (
+ f"sleep 2; "
+ f"launchctl bootout {shlex.quote(target)} 2>/dev/null; "
+ f"sleep 1; "
+ f"launchctl bootstrap {shlex.quote(domain)} {shlex.quote(str(plist_path))} 2>/dev/null"
+ )
+ try:
+ subprocess.Popen(
+ ["/bin/bash", "-c", reload_script],
+ start_new_session=True,
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ )
+ except Exception as e:
+ logger.warning("Deferred launchd reload could not be spawned: %s", e)
+ return False
+ print(
+ "↻ Updated gateway launchd service definition; reload deferred to a "
+ "detached helper (refresh ran inside the gateway process tree)"
+ )
+ return True
+
# Bootout/bootstrap so launchd picks up the new definition
subprocess.run(
- ["launchctl", "bootout", f"{_launchd_domain()}/{label}"],
+ ["launchctl", "bootout", target],
check=False,
timeout=90,
)
subprocess.run(
- ["launchctl", "bootstrap", _launchd_domain(), str(plist_path)],
+ ["launchctl", "bootstrap", domain, str(plist_path)],
check=False,
timeout=30,
)
diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py
index f9cdcc1f313..6dd504a5f70 100644
--- a/tests/hermes_cli/test_gateway_service.py
+++ b/tests/hermes_cli/test_gateway_service.py
@@ -599,6 +599,8 @@ class TestLaunchdServiceRecovery:
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
+ # Not running inside the gateway tree → direct bootout/bootstrap path.
+ monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: None)
gateway_cli.launchd_install()
@@ -613,6 +615,110 @@ class TestLaunchdServiceRecovery:
["launchctl", "bootstrap", domain, str(plist_path)],
]
+ def test_refresh_defers_reload_when_running_inside_gateway_tree(self, tmp_path, monkeypatch):
+ """#43842: when the refresh runs inside the gateway's own process tree,
+ a direct bootout would kill this CLI before bootstrap. The reload must
+ be delegated to a detached helper instead."""
+ plist_path = tmp_path / "ai.hermes.gateway.plist"
+ plist_path.write_text("old content", encoding="utf-8")
+
+ monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
+ monkeypatch.setattr(gateway_cli, "launchd_plist_is_current", lambda: False)
+ monkeypatch.setattr(
+ gateway_cli,
+ "generate_launchd_plist",
+ lambda: (
+ "--replace\nHERMES_HOME"
+ "/Users/alice/.hermes"
+ ),
+ )
+ # Pretend the gateway is running and that we ARE inside its tree.
+ monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: 4242)
+ monkeypatch.setattr(
+ gateway_cli, "_is_pid_ancestor_of_current_process", lambda pid: pid == 4242
+ )
+
+ run_calls = []
+
+ def fake_run(cmd, check=False, **kwargs):
+ run_calls.append(cmd)
+ return SimpleNamespace(returncode=0, stdout="", stderr="")
+
+ monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
+
+ popen_calls = []
+
+ def fake_popen(cmd, **kwargs):
+ popen_calls.append((cmd, kwargs))
+ return SimpleNamespace(pid=9999)
+
+ monkeypatch.setattr(gateway_cli.subprocess, "Popen", fake_popen)
+
+ result = gateway_cli.refresh_launchd_plist_if_needed()
+
+ assert result is True
+ # The new plist was written.
+ assert "--replace" in plist_path.read_text(encoding="utf-8")
+ # No DIRECT bootout/bootstrap ran (those would kill us mid-sequence).
+ assert not [c for c in run_calls if "bootout" in c or "bootstrap" in c]
+ # Exactly one detached helper was spawned, in a new session, and it
+ # performs both bootout and bootstrap.
+ assert len(popen_calls) == 1
+ cmd, kwargs = popen_calls[0]
+ assert kwargs.get("start_new_session") is True
+ script = cmd[-1]
+ assert "bootout" in script and "bootstrap" in script
+ assert str(plist_path) in script
+
+ def test_refresh_uses_direct_reload_when_not_inside_gateway_tree(self, tmp_path, monkeypatch):
+ """Normal CLI-initiated refresh (outside the service tree) keeps the
+ direct synchronous bootout/bootstrap path."""
+ plist_path = tmp_path / "ai.hermes.gateway.plist"
+ plist_path.write_text("old content", encoding="utf-8")
+
+ monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
+ monkeypatch.setattr(gateway_cli, "launchd_plist_is_current", lambda: False)
+ monkeypatch.setattr(
+ gateway_cli,
+ "generate_launchd_plist",
+ lambda: (
+ "--replace\nHERMES_HOME"
+ "/Users/alice/.hermes"
+ ),
+ )
+ # Gateway running, but we are NOT inside its tree.
+ monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: 4242)
+ monkeypatch.setattr(
+ gateway_cli, "_is_pid_ancestor_of_current_process", lambda pid: False
+ )
+
+ run_calls = []
+
+ def fake_run(cmd, check=False, **kwargs):
+ run_calls.append(cmd)
+ return SimpleNamespace(returncode=0, stdout="", stderr="")
+
+ monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
+
+ popen_calls = []
+ monkeypatch.setattr(
+ gateway_cli.subprocess, "Popen",
+ lambda cmd, **kw: popen_calls.append(cmd) or SimpleNamespace(pid=1),
+ )
+
+ result = gateway_cli.refresh_launchd_plist_if_needed()
+
+ assert result is True
+ # No detached helper — direct path taken.
+ assert not popen_calls
+ label = gateway_cli.get_launchd_label()
+ domain = gateway_cli._launchd_domain()
+ service_calls = [c for c in run_calls if "bootout" in c or "bootstrap" in c]
+ assert service_calls[:2] == [
+ ["launchctl", "bootout", f"{domain}/{label}"],
+ ["launchctl", "bootstrap", domain, str(plist_path)],
+ ]
+
def test_launchd_start_reloads_unloaded_job_and_retries(self, tmp_path, monkeypatch):
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")