mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(gateway): use user launchd domain + Background session, detached fallback (macOS 26)
Salvages the primary fix from #24275 (asdlem) and layers a last-resort fallback on top. Primary fix (from #24275): the real macOS 26 root cause is that `gui/<uid>` isn't reachable from non-Aqua/background sessions. Switch the launchd domain to `user/<uid>` and mark the plist valid for both Aqua and Background sessions (LimitLoadToSessionType), restoring a real supervised service. Treat exit code 125 as "job unloaded" so that start/restart re-bootstrap the plist and retry. Last resort (this PR): the #23387 reporter saw the `user/<uid>` bootstrap also fail with error 5 on some hosts. When even a fresh bootstrap can't manage the domain (codes 5/125 persist), degrade to a CLI-managed detached background process instead of crashing — it logs to gateway.log, and its PID is tracked via gateway.pid so stop/status/restart keep working. Print guidance that the detached process won't auto-start at login or auto-restart on crash. Co-authored-by: asdlem <asdlem@users.noreply.github.com>
This commit is contained in:
parent
59c273ba3a
commit
3606307339
2 changed files with 104 additions and 37 deletions
|
|
@ -3000,24 +3000,36 @@ def get_launchd_label() -> str:
|
|||
|
||||
|
||||
def _launchd_domain() -> str:
|
||||
return f"gui/{os.getuid()}" # windows-footgun: ok — POSIX launchd (macOS) helper, never invoked on Windows
|
||||
# The `user/<uid>` domain (vs the older `gui/<uid>`) is reachable from
|
||||
# non-Aqua/background sessions (SSH, headless, login items) and is the only
|
||||
# one that supports service management on macOS 26+. `gui/<uid>` returns
|
||||
# error 125 ("Domain does not support specified action") there. See #23387.
|
||||
return f"user/{os.getuid()}" # windows-footgun: ok — POSIX launchd (macOS) helper, never invoked on Windows
|
||||
|
||||
|
||||
# macOS 26+ broke launchctl management of the per-user GUI domain: `bootstrap`
|
||||
# returns error 5 ("Input/output error") and `kickstart` returns error 125
|
||||
# ("Domain does not support specified action"). When launchd refuses to manage
|
||||
# the gateway we can't supervise it as a service, so we fall back to a detached
|
||||
# background process (the documented `nohup hermes gateway run` workaround).
|
||||
# See issue #23387.
|
||||
# On macOS, exit code 125 ("Domain does not support specified action") and
|
||||
# 3/113 ("Could not find service") all mean the job isn't currently loaded in
|
||||
# the target domain, so start/restart should re-bootstrap the plist and retry.
|
||||
_LAUNCHD_JOB_UNLOADED_EXIT_CODES = frozenset({3, 113, 125})
|
||||
|
||||
# When even a fresh bootstrap can't manage the domain, launchctl returns 5
|
||||
# ("Input/output error") or a persistent 125. On those hosts launchd cannot
|
||||
# supervise the gateway at all, so we degrade to a detached background process
|
||||
# (the documented `nohup hermes gateway run` workaround). See #23387.
|
||||
_LAUNCHCTL_DOMAIN_UNSUPPORTED_CODES = frozenset({5, 125})
|
||||
|
||||
|
||||
def _launchctl_domain_unsupported(returncode: int) -> bool:
|
||||
"""True when launchctl rejected the action because the domain can't manage it.
|
||||
def _launchd_error_indicates_unloaded(exc: subprocess.CalledProcessError) -> bool:
|
||||
"""True when launchctl failed because the job isn't loaded (retry bootstrap)."""
|
||||
return exc.returncode in _LAUNCHD_JOB_UNLOADED_EXIT_CODES
|
||||
|
||||
Codes 5 and 125 are emitted by macOS 26+ for `bootstrap`/`kickstart` against
|
||||
the `gui/<uid>` (and `user/<uid>`) domains, which no longer support service
|
||||
management. Treat these as "launchd unavailable" and degrade gracefully.
|
||||
|
||||
def _launchctl_domain_unsupported(returncode: int) -> bool:
|
||||
"""True when launchctl can't manage the domain even after a fresh bootstrap.
|
||||
|
||||
Codes 5 and 125 persist on macOS hosts where neither `gui/<uid>` nor
|
||||
`user/<uid>` supports service management; treat these as "launchd
|
||||
unavailable" and degrade gracefully to a detached process.
|
||||
"""
|
||||
return returncode in _LAUNCHCTL_DOMAIN_UNSUPPORTED_CODES
|
||||
|
||||
|
|
@ -3170,6 +3182,12 @@ def generate_launchd_plist() -> str:
|
|||
<key>HERMES_HOME</key>
|
||||
<string>{hermes_home}</string>
|
||||
</dict>
|
||||
|
||||
<key>LimitLoadToSessionType</key>
|
||||
<array>
|
||||
<string>Aqua</string>
|
||||
<string>Background</string>
|
||||
</array>
|
||||
|
||||
<key>RunAtLoad</key>
|
||||
<true/>
|
||||
|
|
@ -3321,11 +3339,9 @@ def launchd_start():
|
|||
timeout=30,
|
||||
)
|
||||
except subprocess.CalledProcessError as e:
|
||||
if _launchctl_domain_unsupported(e.returncode):
|
||||
_launchd_fallback_to_detached(f"launchctl kickstart exit {e.returncode}")
|
||||
return
|
||||
if e.returncode not in {3, 113}:
|
||||
if not _launchd_error_indicates_unloaded(e):
|
||||
raise
|
||||
# Job not loaded in this domain — re-bootstrap the plist and retry.
|
||||
print("↻ launchd job was unloaded; reloading service definition")
|
||||
try:
|
||||
subprocess.run(
|
||||
|
|
@ -3339,6 +3355,8 @@ def launchd_start():
|
|||
timeout=30,
|
||||
)
|
||||
except subprocess.CalledProcessError as e2:
|
||||
# Even a fresh bootstrap can't manage the domain on this host —
|
||||
# degrade to a detached background process (issue #23387).
|
||||
if not _launchctl_domain_unsupported(e2.returncode):
|
||||
raise
|
||||
_launchd_fallback_to_detached(f"launchctl exit {e2.returncode}")
|
||||
|
|
@ -3364,10 +3382,12 @@ def launchd_stop():
|
|||
try:
|
||||
subprocess.run(["launchctl", "bootout", target], check=True, timeout=90)
|
||||
except subprocess.CalledProcessError as e:
|
||||
# 3/113: job already unloaded. 5/125: macOS 26+ can't manage the domain
|
||||
# (issue #23387) — the gateway is a detached fallback process, so just
|
||||
# fall through to the PID-based kill below.
|
||||
if e.returncode in {3, 113} or _launchctl_domain_unsupported(e.returncode):
|
||||
# Job already unloaded (3/113/125), or the domain can't be managed at
|
||||
# all (5/125, macOS 26+ detached-fallback process, issue #23387) — in
|
||||
# both cases just fall through to the PID-based kill below.
|
||||
if _launchd_error_indicates_unloaded(e) or _launchctl_domain_unsupported(
|
||||
e.returncode
|
||||
):
|
||||
pass
|
||||
else:
|
||||
raise
|
||||
|
|
@ -3452,13 +3472,13 @@ def launchd_restart():
|
|||
subprocess.run(["launchctl", "kickstart", "-k", target], check=True, timeout=90)
|
||||
print("✓ Service restarted")
|
||||
except subprocess.CalledProcessError as e:
|
||||
if _launchctl_domain_unsupported(e.returncode):
|
||||
# macOS 26+ can't kickstart the domain (issue #23387). The old
|
||||
# process was already drained/terminated above, so relaunch a
|
||||
# fresh detached gateway.
|
||||
_launchd_fallback_to_detached(f"launchctl kickstart exit {e.returncode}")
|
||||
return
|
||||
if e.returncode not in {3, 113}:
|
||||
if not _launchd_error_indicates_unloaded(e):
|
||||
# Not a "job unloaded" code. If the domain is fundamentally
|
||||
# unmanageable (error 5), degrade to detached; the old process was
|
||||
# already drained/terminated above. Otherwise re-raise.
|
||||
if _launchctl_domain_unsupported(e.returncode):
|
||||
_launchd_fallback_to_detached(f"launchctl kickstart exit {e.returncode}")
|
||||
return
|
||||
raise
|
||||
# Job not loaded — bootstrap and start fresh
|
||||
print("↻ launchd job was unloaded; reloading")
|
||||
|
|
|
|||
|
|
@ -679,17 +679,51 @@ class TestLaunchdServiceRecovery:
|
|||
assert "stale" in output.lower()
|
||||
assert "not loaded" in output.lower()
|
||||
|
||||
def test_launchd_domain_uses_user_domain(self):
|
||||
# The user/<uid> domain (not gui/<uid>) is the one reachable from
|
||||
# non-Aqua/background sessions on macOS 26+ (issue #23387).
|
||||
assert gateway_cli._launchd_domain() == f"user/{os.getuid()}"
|
||||
|
||||
def test_launchctl_domain_unsupported_recognizes_macos26_codes(self):
|
||||
# macOS 26+ rejects gui/<uid> management with these codes (issue #23387).
|
||||
# Codes that persist after a fresh bootstrap → launchd truly unavailable.
|
||||
assert gateway_cli._launchctl_domain_unsupported(5) is True
|
||||
assert gateway_cli._launchctl_domain_unsupported(125) is True
|
||||
# Codes that mean "job not loaded" are NOT domain-unsupported.
|
||||
assert gateway_cli._launchctl_domain_unsupported(3) is False
|
||||
assert gateway_cli._launchctl_domain_unsupported(113) is False
|
||||
assert gateway_cli._launchctl_domain_unsupported(0) is False
|
||||
|
||||
def test_launchd_start_falls_back_to_detached_on_kickstart_125(self, tmp_path, monkeypatch, capsys):
|
||||
"""macOS 26 kickstart error 125 should spawn a detached gateway, not crash."""
|
||||
def test_launchd_start_reloads_on_kickstart_exit_code_125(self, tmp_path, monkeypatch):
|
||||
"""Exit code 125 means the job is absent from the domain → bootstrap recovery."""
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")
|
||||
label = gateway_cli.get_launchd_label()
|
||||
|
||||
calls = []
|
||||
domain = gateway_cli._launchd_domain()
|
||||
target = f"{domain}/{label}"
|
||||
|
||||
def fake_run(cmd, check=False, **kwargs):
|
||||
if cmd and cmd[0] == "launchctl":
|
||||
calls.append(cmd)
|
||||
if cmd == ["launchctl", "kickstart", target] and calls.count(cmd) == 1:
|
||||
raise gateway_cli.subprocess.CalledProcessError(
|
||||
125, cmd, stderr="Domain does not support specified action"
|
||||
)
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
gateway_cli.launchd_start()
|
||||
|
||||
assert calls == [
|
||||
["launchctl", "kickstart", target],
|
||||
["launchctl", "bootstrap", domain, str(plist_path)],
|
||||
["launchctl", "kickstart", target],
|
||||
]
|
||||
|
||||
def test_launchd_start_falls_back_to_detached_when_rebootstrap_fails(self, tmp_path, monkeypatch, capsys):
|
||||
"""If even a fresh bootstrap can't manage the domain, spawn detached."""
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")
|
||||
label = gateway_cli.get_launchd_label()
|
||||
|
|
@ -700,9 +734,15 @@ class TestLaunchdServiceRecovery:
|
|||
|
||||
def fake_run(cmd, check=False, **kwargs):
|
||||
if cmd == ["launchctl", "kickstart", target]:
|
||||
# First kickstart: job not loaded (125). After bootstrap also
|
||||
# fails, this won't be reached again.
|
||||
raise gateway_cli.subprocess.CalledProcessError(
|
||||
125, cmd, stderr="Domain does not support specified action"
|
||||
)
|
||||
if cmd[:2] == ["launchctl", "bootstrap"]:
|
||||
raise gateway_cli.subprocess.CalledProcessError(
|
||||
5, cmd, stderr="Input/output error"
|
||||
)
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
|
@ -715,11 +755,10 @@ class TestLaunchdServiceRecovery:
|
|||
gateway_cli.launchd_start()
|
||||
|
||||
assert spawned == [True]
|
||||
out = capsys.readouterr().out.lower()
|
||||
assert "background process" in out
|
||||
assert "background process" in capsys.readouterr().out.lower()
|
||||
|
||||
def test_launchd_install_falls_back_to_detached_on_bootstrap_5(self, tmp_path, monkeypatch, capsys):
|
||||
"""macOS 26 bootstrap error 5 should spawn a detached gateway, not crash."""
|
||||
"""macOS bootstrap error 5 should spawn a detached gateway, not crash."""
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||
|
||||
|
|
@ -742,8 +781,8 @@ class TestLaunchdServiceRecovery:
|
|||
assert spawned == [True]
|
||||
assert "Service installed and loaded" not in capsys.readouterr().out
|
||||
|
||||
def test_launchd_restart_falls_back_to_detached_on_kickstart_125(self, monkeypatch, capsys):
|
||||
"""When kickstart -k returns 125, restart should relaunch detached."""
|
||||
def test_launchd_restart_falls_back_to_detached_on_error_5(self, monkeypatch, capsys):
|
||||
"""kickstart -k error 5 (domain unmanageable) should relaunch detached."""
|
||||
target = f"{gateway_cli._launchd_domain()}/{gateway_cli.get_launchd_label()}"
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "_get_restart_drain_timeout", lambda: 5.0)
|
||||
|
|
@ -755,7 +794,7 @@ class TestLaunchdServiceRecovery:
|
|||
def fake_run(cmd, check=False, **kwargs):
|
||||
if cmd == ["launchctl", "kickstart", "-k", target]:
|
||||
raise gateway_cli.subprocess.CalledProcessError(
|
||||
125, cmd, stderr="Domain does not support specified action"
|
||||
5, cmd, stderr="Input/output error"
|
||||
)
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
|
|
@ -1746,6 +1785,14 @@ class TestProfileArg:
|
|||
assert "<string>--profile</string>" in plist
|
||||
assert "<string>mybot</string>" in plist
|
||||
|
||||
def test_launchd_plist_supports_aqua_and_background_sessions(self):
|
||||
# macOS 26+ only loads the agent in non-Aqua sessions when the plist
|
||||
# opts into Background as well (issue #23387).
|
||||
plist = gateway_cli.generate_launchd_plist()
|
||||
assert "<key>LimitLoadToSessionType</key>" in plist
|
||||
assert "<string>Aqua</string>" in plist
|
||||
assert "<string>Background</string>" in plist
|
||||
|
||||
def test_launchd_plist_path_uses_real_user_home_not_profile_home(self, tmp_path, monkeypatch):
|
||||
profile_dir = tmp_path / ".hermes" / "profiles" / "orcha"
|
||||
profile_dir.mkdir(parents=True)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue