fix(gateway): use user launchd domain + Background session, detached fallback (macOS 26)

Salvages the primary fix from #24275 (asdlem) and layers a last-resort
fallback on top:

Primary (from #24275): the real macOS 26 root cause is that `gui/<uid>`
isn't reachable from non-Aqua/background sessions. Switch the launchd
domain to `user/<uid>` and mark the plist valid for both Aqua and
Background sessions (LimitLoadToSessionType), restoring a real supervised
service. Treat exit code 125 as "job unloaded" (alongside 3 and 113) so
that start/restart re-bootstrap the plist and retry.

Last resort (this PR): the #23387 reporter saw `user/<uid>` bootstrap
also fail with error 5 on some hosts. When even a fresh bootstrap can't
manage the domain (codes 5/125 persist), degrade to a CLI-managed
detached background process instead of crashing. The process logs to
gateway.log and its PID is tracked via gateway.pid, so stop/status/restart
keep working. Print guidance that it won't auto-start at login or
auto-restart on crash.

Co-authored-by: asdlem <asdlem@users.noreply.github.com>
This commit is contained in:
Brooklyn Nicholson 2026-06-06 10:28:52 -05:00 committed by Teknium
parent 59c273ba3a
commit 3606307339
2 changed files with 104 additions and 37 deletions

View file

@ -3000,24 +3000,36 @@ def get_launchd_label() -> str:
def _launchd_domain() -> str:
return f"gui/{os.getuid()}" # windows-footgun: ok — POSIX launchd (macOS) helper, never invoked on Windows
# The `user/<uid>` domain (vs the older `gui/<uid>`) is reachable from
# non-Aqua/background sessions (SSH, headless, login items) and is the only
# one that can support service management on macOS 26+ (some hosts reject
# it too). `gui/<uid>` returns error 125 ("Domain does not support specified
# action") there. See #23387.
return f"user/{os.getuid()}" # windows-footgun: ok — POSIX launchd (macOS) helper, never invoked on Windows
# macOS 26+ broke launchctl management of the per-user GUI domain: `bootstrap`
# returns error 5 ("Input/output error") and `kickstart` returns error 125
# ("Domain does not support specified action"). When launchd refuses to manage
# the gateway we can't supervise it as a service, so we fall back to a detached
# background process (the documented `nohup hermes gateway run` workaround).
# See issue #23387.
# On macOS, exit codes 125 ("Domain does not support specified action") and
# 3/113 ("Could not find service") all mean the job isn't currently loaded in
# the target domain, so start/restart should re-bootstrap the plist and retry.
_LAUNCHD_JOB_UNLOADED_EXIT_CODES = frozenset({3, 113, 125})
# When even a fresh bootstrap can't manage the domain, launchctl returns 5
# ("Input/output error") or a persistent 125. On those hosts launchd cannot
# supervise the gateway at all, so we degrade to a detached background process
# (the documented `nohup hermes gateway run` workaround). See #23387.
_LAUNCHCTL_DOMAIN_UNSUPPORTED_CODES = frozenset({5, 125})
def _launchctl_domain_unsupported(returncode: int) -> bool:
"""True when launchctl rejected the action because the domain can't manage it.
def _launchd_error_indicates_unloaded(exc: subprocess.CalledProcessError) -> bool:
"""True when launchctl failed because the job isn't loaded (retry bootstrap)."""
return exc.returncode in _LAUNCHD_JOB_UNLOADED_EXIT_CODES
Codes 5 and 125 are emitted by macOS 26+ for `bootstrap`/`kickstart` against
the `gui/<uid>` (and `user/<uid>`) domains, which no longer support service
management. Treat these as "launchd unavailable" and degrade gracefully.
def _launchctl_domain_unsupported(returncode: int) -> bool:
"""True when launchctl can't manage the domain even after a fresh bootstrap.
Codes 5 and 125 persist on macOS hosts where neither `gui/<uid>` nor
`user/<uid>` supports service management; treat these as "launchd
unavailable" and degrade gracefully to a detached process.
"""
return returncode in _LAUNCHCTL_DOMAIN_UNSUPPORTED_CODES
@ -3170,6 +3182,12 @@ def generate_launchd_plist() -> str:
<key>HERMES_HOME</key>
<string>{hermes_home}</string>
</dict>
<key>LimitLoadToSessionType</key>
<array>
<string>Aqua</string>
<string>Background</string>
</array>
<key>RunAtLoad</key>
<true/>
@ -3321,11 +3339,9 @@ def launchd_start():
timeout=30,
)
except subprocess.CalledProcessError as e:
if _launchctl_domain_unsupported(e.returncode):
_launchd_fallback_to_detached(f"launchctl kickstart exit {e.returncode}")
return
if e.returncode not in {3, 113}:
if not _launchd_error_indicates_unloaded(e):
raise
# Job not loaded in this domain — re-bootstrap the plist and retry.
print("↻ launchd job was unloaded; reloading service definition")
try:
subprocess.run(
@ -3339,6 +3355,8 @@ def launchd_start():
timeout=30,
)
except subprocess.CalledProcessError as e2:
# Even a fresh bootstrap can't manage the domain on this host —
# degrade to a detached background process (issue #23387).
if not _launchctl_domain_unsupported(e2.returncode):
raise
_launchd_fallback_to_detached(f"launchctl exit {e2.returncode}")
@ -3364,10 +3382,12 @@ def launchd_stop():
try:
subprocess.run(["launchctl", "bootout", target], check=True, timeout=90)
except subprocess.CalledProcessError as e:
# 3/113: job already unloaded. 5/125: macOS 26+ can't manage the domain
# (issue #23387) — the gateway is a detached fallback process, so just
# fall through to the PID-based kill below.
if e.returncode in {3, 113} or _launchctl_domain_unsupported(e.returncode):
# Job already unloaded (3/113/125), or the domain can't be managed at
# all (5/125, macOS 26+ detached-fallback process, issue #23387) — in
# both cases just fall through to the PID-based kill below.
if _launchd_error_indicates_unloaded(e) or _launchctl_domain_unsupported(
e.returncode
):
pass
else:
raise
@ -3452,13 +3472,13 @@ def launchd_restart():
subprocess.run(["launchctl", "kickstart", "-k", target], check=True, timeout=90)
print("✓ Service restarted")
except subprocess.CalledProcessError as e:
if _launchctl_domain_unsupported(e.returncode):
# macOS 26+ can't kickstart the domain (issue #23387). The old
# process was already drained/terminated above, so relaunch a
# fresh detached gateway.
_launchd_fallback_to_detached(f"launchctl kickstart exit {e.returncode}")
return
if e.returncode not in {3, 113}:
if not _launchd_error_indicates_unloaded(e):
# Not a "job unloaded" code. If the domain is fundamentally
# unmanageable (error 5), degrade to detached; the old process was
# already drained/terminated above. Otherwise re-raise.
if _launchctl_domain_unsupported(e.returncode):
_launchd_fallback_to_detached(f"launchctl kickstart exit {e.returncode}")
return
raise
# Job not loaded — bootstrap and start fresh
print("↻ launchd job was unloaded; reloading")

View file

@ -679,17 +679,51 @@ class TestLaunchdServiceRecovery:
assert "stale" in output.lower()
assert "not loaded" in output.lower()
def test_launchd_domain_uses_user_domain(self):
# The user/<uid> domain (not gui/<uid>) is the one reachable from
# non-Aqua/background sessions on macOS 26+ (issue #23387).
assert gateway_cli._launchd_domain() == f"user/{os.getuid()}"
def test_launchctl_domain_unsupported_recognizes_macos26_codes(self):
# macOS 26+ rejects gui/<uid> management with these codes (issue #23387).
# Codes that persist after a fresh bootstrap → launchd truly unavailable.
assert gateway_cli._launchctl_domain_unsupported(5) is True
assert gateway_cli._launchctl_domain_unsupported(125) is True
# Codes that mean "job not loaded" are NOT domain-unsupported.
assert gateway_cli._launchctl_domain_unsupported(3) is False
assert gateway_cli._launchctl_domain_unsupported(113) is False
assert gateway_cli._launchctl_domain_unsupported(0) is False
def test_launchd_start_falls_back_to_detached_on_kickstart_125(self, tmp_path, monkeypatch, capsys):
"""macOS 26 kickstart error 125 should spawn a detached gateway, not crash."""
def test_launchd_start_reloads_on_kickstart_exit_code_125(self, tmp_path, monkeypatch):
"""Exit code 125 means the job is absent from the domain → bootstrap recovery."""
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")
label = gateway_cli.get_launchd_label()
calls = []
domain = gateway_cli._launchd_domain()
target = f"{domain}/{label}"
def fake_run(cmd, check=False, **kwargs):
if cmd and cmd[0] == "launchctl":
calls.append(cmd)
if cmd == ["launchctl", "kickstart", target] and calls.count(cmd) == 1:
raise gateway_cli.subprocess.CalledProcessError(
125, cmd, stderr="Domain does not support specified action"
)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
gateway_cli.launchd_start()
assert calls == [
["launchctl", "kickstart", target],
["launchctl", "bootstrap", domain, str(plist_path)],
["launchctl", "kickstart", target],
]
def test_launchd_start_falls_back_to_detached_when_rebootstrap_fails(self, tmp_path, monkeypatch, capsys):
"""If even a fresh bootstrap can't manage the domain, spawn detached."""
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")
label = gateway_cli.get_launchd_label()
@ -700,9 +734,15 @@ class TestLaunchdServiceRecovery:
def fake_run(cmd, check=False, **kwargs):
if cmd == ["launchctl", "kickstart", target]:
# First kickstart: job not loaded (125). After bootstrap also
# fails, this won't be reached again.
raise gateway_cli.subprocess.CalledProcessError(
125, cmd, stderr="Domain does not support specified action"
)
if cmd[:2] == ["launchctl", "bootstrap"]:
raise gateway_cli.subprocess.CalledProcessError(
5, cmd, stderr="Input/output error"
)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
@ -715,11 +755,10 @@ class TestLaunchdServiceRecovery:
gateway_cli.launchd_start()
assert spawned == [True]
out = capsys.readouterr().out.lower()
assert "background process" in out
assert "background process" in capsys.readouterr().out.lower()
def test_launchd_install_falls_back_to_detached_on_bootstrap_5(self, tmp_path, monkeypatch, capsys):
"""macOS 26 bootstrap error 5 should spawn a detached gateway, not crash."""
"""macOS bootstrap error 5 should spawn a detached gateway, not crash."""
plist_path = tmp_path / "ai.hermes.gateway.plist"
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
@ -742,8 +781,8 @@ class TestLaunchdServiceRecovery:
assert spawned == [True]
assert "Service installed and loaded" not in capsys.readouterr().out
def test_launchd_restart_falls_back_to_detached_on_kickstart_125(self, monkeypatch, capsys):
"""When kickstart -k returns 125, restart should relaunch detached."""
def test_launchd_restart_falls_back_to_detached_on_error_5(self, monkeypatch, capsys):
"""kickstart -k error 5 (domain unmanageable) should relaunch detached."""
target = f"{gateway_cli._launchd_domain()}/{gateway_cli.get_launchd_label()}"
monkeypatch.setattr(gateway_cli, "_get_restart_drain_timeout", lambda: 5.0)
@ -755,7 +794,7 @@ class TestLaunchdServiceRecovery:
def fake_run(cmd, check=False, **kwargs):
if cmd == ["launchctl", "kickstart", "-k", target]:
raise gateway_cli.subprocess.CalledProcessError(
125, cmd, stderr="Domain does not support specified action"
5, cmd, stderr="Input/output error"
)
return SimpleNamespace(returncode=0, stdout="", stderr="")
@ -1746,6 +1785,14 @@ class TestProfileArg:
assert "<string>--profile</string>" in plist
assert "<string>mybot</string>" in plist
def test_launchd_plist_supports_aqua_and_background_sessions(self):
# macOS 26+ only loads the agent in non-Aqua sessions when the plist
# opts into Background as well (issue #23387).
plist = gateway_cli.generate_launchd_plist()
assert "<key>LimitLoadToSessionType</key>" in plist
assert "<string>Aqua</string>" in plist
assert "<string>Background</string>" in plist
def test_launchd_plist_path_uses_real_user_home_not_profile_home(self, tmp_path, monkeypatch):
profile_dir = tmp_path / ".hermes" / "profiles" / "orcha"
profile_dir.mkdir(parents=True)