fix(gateway): defer macOS launchd reload when run inside the gateway tree (#47842)

When refresh_launchd_plist_if_needed() runs from inside the gateway's own
launchd process tree (agent-initiated self-update via the terminal tool), a
direct launchctl bootout tears down the service's process group — including
the CLI doing the refresh — before the follow-up bootstrap can run. The
gateway is left unloaded and KeepAlive can't revive it (#43842).

Detect in-service execution via gateway.status.get_running_pid() +
_is_pid_ancestor_of_current_process(), and delegate the bootout->bootstrap to
a detached (start_new_session=True) helper that survives the process-group
teardown. The normal out-of-tree CLI path is unchanged.

Fixes #43842.
This commit is contained in:
Teknium 2026-06-17 05:19:21 -07:00 committed by GitHub
parent 4d39a603d1
commit e48803daec
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 155 additions and 2 deletions

View file

@ -7,6 +7,7 @@ Handles: hermes gateway [run|start|stop|restart|status|install|uninstall|setup]
import asyncio
import logging
import os
import shlex
import shutil
import signal
import subprocess
@ -3467,14 +3468,60 @@ def refresh_launchd_plist_if_needed() -> bool:
plist_path.write_text(new_plist, encoding="utf-8")
label = get_launchd_label()
domain = _launchd_domain()
target = f"{domain}/{label}"
# If this refresh is running INSIDE the gateway's own launchd process tree
# (e.g. the agent triggered a self-update via its terminal tool), a direct
# `launchctl bootout` tears down the service's process group — which
# includes THIS CLI — before the follow-up `bootstrap` can run. The gateway
# then stays unloaded and KeepAlive can't revive it (#43842). Detect that
# case and hand the reload to a detached session that survives the bootout.
gateway_pid = None
try:
from gateway.status import get_running_pid
gateway_pid = get_running_pid()
except Exception:
gateway_pid = None
if (
gateway_pid is not None
and _is_pid_ancestor_of_current_process(gateway_pid)
and hasattr(os, "setsid") # POSIX-only; launchd is macOS so always true here
):
# Delegate to a new session: `start_new_session=True` detaches the
# helper from the gateway's process group, so the bootout that kills
# the gateway (and us) does not kill the helper before it bootstraps.
reload_script = (
f"sleep 2; "
f"launchctl bootout {shlex.quote(target)} 2>/dev/null; "
f"sleep 1; "
f"launchctl bootstrap {shlex.quote(domain)} {shlex.quote(str(plist_path))} 2>/dev/null"
)
try:
subprocess.Popen(
["/bin/bash", "-c", reload_script],
start_new_session=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
except Exception as e:
logger.warning("Deferred launchd reload could not be spawned: %s", e)
return False
print(
"↻ Updated gateway launchd service definition; reload deferred to a "
"detached helper (refresh ran inside the gateway process tree)"
)
return True
# Bootout/bootstrap so launchd picks up the new definition
subprocess.run(
["launchctl", "bootout", f"{_launchd_domain()}/{label}"],
["launchctl", "bootout", target],
check=False,
timeout=90,
)
subprocess.run(
["launchctl", "bootstrap", _launchd_domain(), str(plist_path)],
["launchctl", "bootstrap", domain, str(plist_path)],
check=False,
timeout=30,
)

View file

@ -599,6 +599,8 @@ class TestLaunchdServiceRecovery:
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
# Not running inside the gateway tree → direct bootout/bootstrap path.
monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: None)
gateway_cli.launchd_install()
@ -613,6 +615,110 @@ class TestLaunchdServiceRecovery:
["launchctl", "bootstrap", domain, str(plist_path)],
]
def test_refresh_defers_reload_when_running_inside_gateway_tree(self, tmp_path, monkeypatch):
"""#43842: when the refresh runs inside the gateway's own process tree,
a direct bootout would kill this CLI before bootstrap. The reload must
be delegated to a detached helper instead."""
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text("<plist>old content</plist>", encoding="utf-8")
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
monkeypatch.setattr(gateway_cli, "launchd_plist_is_current", lambda: False)
monkeypatch.setattr(
gateway_cli,
"generate_launchd_plist",
lambda: (
"<plist>--replace\n<key>HERMES_HOME</key>"
"<string>/Users/alice/.hermes</string></plist>"
),
)
# Pretend the gateway is running and that we ARE inside its tree.
monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: 4242)
monkeypatch.setattr(
gateway_cli, "_is_pid_ancestor_of_current_process", lambda pid: pid == 4242
)
run_calls = []
def fake_run(cmd, check=False, **kwargs):
run_calls.append(cmd)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
popen_calls = []
def fake_popen(cmd, **kwargs):
popen_calls.append((cmd, kwargs))
return SimpleNamespace(pid=9999)
monkeypatch.setattr(gateway_cli.subprocess, "Popen", fake_popen)
result = gateway_cli.refresh_launchd_plist_if_needed()
assert result is True
# The new plist was written.
assert "--replace" in plist_path.read_text(encoding="utf-8")
# No DIRECT bootout/bootstrap ran (those would kill us mid-sequence).
assert not [c for c in run_calls if "bootout" in c or "bootstrap" in c]
# Exactly one detached helper was spawned, in a new session, and it
# performs both bootout and bootstrap.
assert len(popen_calls) == 1
cmd, kwargs = popen_calls[0]
assert kwargs.get("start_new_session") is True
script = cmd[-1]
assert "bootout" in script and "bootstrap" in script
assert str(plist_path) in script
def test_refresh_uses_direct_reload_when_not_inside_gateway_tree(self, tmp_path, monkeypatch):
"""Normal CLI-initiated refresh (outside the service tree) keeps the
direct synchronous bootout/bootstrap path."""
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text("<plist>old content</plist>", encoding="utf-8")
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
monkeypatch.setattr(gateway_cli, "launchd_plist_is_current", lambda: False)
monkeypatch.setattr(
gateway_cli,
"generate_launchd_plist",
lambda: (
"<plist>--replace\n<key>HERMES_HOME</key>"
"<string>/Users/alice/.hermes</string></plist>"
),
)
# Gateway running, but we are NOT inside its tree.
monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: 4242)
monkeypatch.setattr(
gateway_cli, "_is_pid_ancestor_of_current_process", lambda pid: False
)
run_calls = []
def fake_run(cmd, check=False, **kwargs):
run_calls.append(cmd)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
popen_calls = []
monkeypatch.setattr(
gateway_cli.subprocess, "Popen",
lambda cmd, **kw: popen_calls.append(cmd) or SimpleNamespace(pid=1),
)
result = gateway_cli.refresh_launchd_plist_if_needed()
assert result is True
# No detached helper — direct path taken.
assert not popen_calls
label = gateway_cli.get_launchd_label()
domain = gateway_cli._launchd_domain()
service_calls = [c for c in run_calls if "bootout" in c or "bootstrap" in c]
assert service_calls[:2] == [
["launchctl", "bootout", f"{domain}/{label}"],
["launchctl", "bootstrap", domain, str(plist_path)],
]
def test_launchd_start_reloads_unloaded_job_and_retries(self, tmp_path, monkeypatch):
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")