diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 75b24730123..a148f0b24cd 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -3000,24 +3000,36 @@ def get_launchd_label() -> str: def _launchd_domain() -> str: - return f"gui/{os.getuid()}" # windows-footgun: ok — POSIX launchd (macOS) helper, never invoked on Windows + # The `user/` domain (vs the older `gui/`) is reachable from + # non-Aqua/background sessions (SSH, headless, login items) and is the only + # one that supports service management on macOS 26+. `gui/` returns + # error 125 ("Domain does not support specified action") there. See #23387. + return f"user/{os.getuid()}" # windows-footgun: ok — POSIX launchd (macOS) helper, never invoked on Windows -# macOS 26+ broke launchctl management of the per-user GUI domain: `bootstrap` -# returns error 5 ("Input/output error") and `kickstart` returns error 125 -# ("Domain does not support specified action"). When launchd refuses to manage -# the gateway we can't supervise it as a service, so we fall back to a detached -# background process (the documented `nohup hermes gateway run` workaround). -# See issue #23387. +# On macOS, exit code 125 ("Domain does not support specified action") and +# 3/113 ("Could not find service") all mean the job isn't currently loaded in +# the target domain, so start/restart should re-bootstrap the plist and retry. +_LAUNCHD_JOB_UNLOADED_EXIT_CODES = frozenset({3, 113, 125}) + +# When even a fresh bootstrap can't manage the domain, launchctl returns 5 +# ("Input/output error") or a persistent 125. On those hosts launchd cannot +# supervise the gateway at all, so we degrade to a detached background process +# (the documented `nohup hermes gateway run` workaround). See #23387. _LAUNCHCTL_DOMAIN_UNSUPPORTED_CODES = frozenset({5, 125}) -def _launchctl_domain_unsupported(returncode: int) -> bool: - """True when launchctl rejected the action because the domain can't manage it. 
+def _launchd_error_indicates_unloaded(exc: subprocess.CalledProcessError) -> bool: + """True when launchctl failed because the job isn't loaded (retry bootstrap).""" + return exc.returncode in _LAUNCHD_JOB_UNLOADED_EXIT_CODES - Codes 5 and 125 are emitted by macOS 26+ for `bootstrap`/`kickstart` against - the `gui/` (and `user/`) domains, which no longer support service - management. Treat these as "launchd unavailable" and degrade gracefully. + +def _launchctl_domain_unsupported(returncode: int) -> bool: + """True when launchctl can't manage the domain even after a fresh bootstrap. + + Codes 5 and 125 persist on macOS hosts where neither `gui/` nor + `user/` supports service management; treat these as "launchd + unavailable" and degrade gracefully to a detached process. """ return returncode in _LAUNCHCTL_DOMAIN_UNSUPPORTED_CODES @@ -3170,6 +3182,12 @@ def generate_launchd_plist() -> str: <key>HERMES_HOME</key> <string>{hermes_home}</string> </dict> + + <key>LimitLoadToSessionType</key> + <array> + <string>Aqua</string> + <string>Background</string> + </array> <key>RunAtLoad</key> <true/> @@ -3321,11 +3339,9 @@ def launchd_start(): timeout=30, ) except subprocess.CalledProcessError as e: - if _launchctl_domain_unsupported(e.returncode): - _launchd_fallback_to_detached(f"launchctl kickstart exit {e.returncode}") - return - if e.returncode not in {3, 113}: + if not _launchd_error_indicates_unloaded(e): raise + # Job not loaded in this domain — re-bootstrap the plist and retry. print("↻ launchd job was unloaded; reloading service definition") try: subprocess.run( @@ -3339,6 +3355,8 @@ def launchd_start(): timeout=30, ) except subprocess.CalledProcessError as e2: + # Even a fresh bootstrap can't manage the domain on this host — + # degrade to a detached background process (issue #23387). 
if not _launchctl_domain_unsupported(e2.returncode): raise _launchd_fallback_to_detached(f"launchctl exit {e2.returncode}") @@ -3364,10 +3382,12 @@ def launchd_stop(): try: subprocess.run(["launchctl", "bootout", target], check=True, timeout=90) except subprocess.CalledProcessError as e: - # 3/113: job already unloaded. 5/125: macOS 26+ can't manage the domain - # (issue #23387) — the gateway is a detached fallback process, so just - # fall through to the PID-based kill below. - if e.returncode in {3, 113} or _launchctl_domain_unsupported(e.returncode): + # Job already unloaded (3/113/125), or the domain can't be managed at + # all (5/125, macOS 26+ detached-fallback process, issue #23387) — in + # both cases just fall through to the PID-based kill below. + if _launchd_error_indicates_unloaded(e) or _launchctl_domain_unsupported( + e.returncode + ): pass else: raise @@ -3452,13 +3472,13 @@ def launchd_restart(): subprocess.run(["launchctl", "kickstart", "-k", target], check=True, timeout=90) print("✓ Service restarted") except subprocess.CalledProcessError as e: - if _launchctl_domain_unsupported(e.returncode): - # macOS 26+ can't kickstart the domain (issue #23387). The old - # process was already drained/terminated above, so relaunch a - # fresh detached gateway. - _launchd_fallback_to_detached(f"launchctl kickstart exit {e.returncode}") - return - if e.returncode not in {3, 113}: + if not _launchd_error_indicates_unloaded(e): + # Not a "job unloaded" code. If the domain is fundamentally + # unmanageable (error 5), degrade to detached; the old process was + # already drained/terminated above. Otherwise re-raise. 
+ if _launchctl_domain_unsupported(e.returncode): + _launchd_fallback_to_detached(f"launchctl kickstart exit {e.returncode}") + return raise # Job not loaded — bootstrap and start fresh print("↻ launchd job was unloaded; reloading") diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index 90ad6a1a155..18e89fa408d 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -679,17 +679,51 @@ class TestLaunchdServiceRecovery: assert "stale" in output.lower() assert "not loaded" in output.lower() + def test_launchd_domain_uses_user_domain(self): + # The user/ domain (not gui/) is the one reachable from + # non-Aqua/background sessions on macOS 26+ (issue #23387). + assert gateway_cli._launchd_domain() == f"user/{os.getuid()}" + def test_launchctl_domain_unsupported_recognizes_macos26_codes(self): - # macOS 26+ rejects gui/ management with these codes (issue #23387). + # Codes that persist after a fresh bootstrap → launchd truly unavailable. assert gateway_cli._launchctl_domain_unsupported(5) is True assert gateway_cli._launchctl_domain_unsupported(125) is True - # Codes that mean "job not loaded" are NOT domain-unsupported. 
assert gateway_cli._launchctl_domain_unsupported(3) is False assert gateway_cli._launchctl_domain_unsupported(113) is False assert gateway_cli._launchctl_domain_unsupported(0) is False - def test_launchd_start_falls_back_to_detached_on_kickstart_125(self, tmp_path, monkeypatch, capsys): - """macOS 26 kickstart error 125 should spawn a detached gateway, not crash.""" + def test_launchd_start_reloads_on_kickstart_exit_code_125(self, tmp_path, monkeypatch): + """Exit code 125 means the job is absent from the domain → bootstrap recovery.""" + plist_path = tmp_path / "ai.hermes.gateway.plist" + plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8") + label = gateway_cli.get_launchd_label() + + calls = [] + domain = gateway_cli._launchd_domain() + target = f"{domain}/{label}" + + def fake_run(cmd, check=False, **kwargs): + if cmd and cmd[0] == "launchctl": + calls.append(cmd) + if cmd == ["launchctl", "kickstart", target] and calls.count(cmd) == 1: + raise gateway_cli.subprocess.CalledProcessError( + 125, cmd, stderr="Domain does not support specified action" + ) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path) + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + + gateway_cli.launchd_start() + + assert calls == [ + ["launchctl", "kickstart", target], + ["launchctl", "bootstrap", domain, str(plist_path)], + ["launchctl", "kickstart", target], + ] + + def test_launchd_start_falls_back_to_detached_when_rebootstrap_fails(self, tmp_path, monkeypatch, capsys): + """If even a fresh bootstrap can't manage the domain, spawn detached.""" plist_path = tmp_path / "ai.hermes.gateway.plist" plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8") label = gateway_cli.get_launchd_label() @@ -700,9 +734,15 @@ class TestLaunchdServiceRecovery: def fake_run(cmd, check=False, **kwargs): if cmd == ["launchctl", "kickstart", target]: + # First 
kickstart: job not loaded (125). After bootstrap also + # fails, this won't be reached again. raise gateway_cli.subprocess.CalledProcessError( 125, cmd, stderr="Domain does not support specified action" ) + if cmd[:2] == ["launchctl", "bootstrap"]: + raise gateway_cli.subprocess.CalledProcessError( + 5, cmd, stderr="Input/output error" + ) return SimpleNamespace(returncode=0, stdout="", stderr="") monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) @@ -715,11 +755,10 @@ class TestLaunchdServiceRecovery: gateway_cli.launchd_start() assert spawned == [True] - out = capsys.readouterr().out.lower() - assert "background process" in out + assert "background process" in capsys.readouterr().out.lower() def test_launchd_install_falls_back_to_detached_on_bootstrap_5(self, tmp_path, monkeypatch, capsys): - """macOS 26 bootstrap error 5 should spawn a detached gateway, not crash.""" + """macOS bootstrap error 5 should spawn a detached gateway, not crash.""" plist_path = tmp_path / "ai.hermes.gateway.plist" monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path) @@ -742,8 +781,8 @@ class TestLaunchdServiceRecovery: assert spawned == [True] assert "Service installed and loaded" not in capsys.readouterr().out - def test_launchd_restart_falls_back_to_detached_on_kickstart_125(self, monkeypatch, capsys): - """When kickstart -k returns 125, restart should relaunch detached.""" + def test_launchd_restart_falls_back_to_detached_on_error_5(self, monkeypatch, capsys): + """kickstart -k error 5 (domain unmanageable) should relaunch detached.""" target = f"{gateway_cli._launchd_domain()}/{gateway_cli.get_launchd_label()}" monkeypatch.setattr(gateway_cli, "_get_restart_drain_timeout", lambda: 5.0) @@ -755,7 +794,7 @@ class TestLaunchdServiceRecovery: def fake_run(cmd, check=False, **kwargs): if cmd == ["launchctl", "kickstart", "-k", target]: raise gateway_cli.subprocess.CalledProcessError( - 125, cmd, stderr="Domain does not support specified action" + 5, 
cmd, stderr="Input/output error" ) return SimpleNamespace(returncode=0, stdout="", stderr="") @@ -1746,6 +1785,14 @@ class TestProfileArg: assert "--profile" in plist assert "mybot" in plist + def test_launchd_plist_supports_aqua_and_background_sessions(self): + # macOS 26+ only loads the agent in non-Aqua sessions when the plist + # opts into Background as well (issue #23387). + plist = gateway_cli.generate_launchd_plist() + assert "LimitLoadToSessionType" in plist + assert "Aqua" in plist + assert "Background" in plist + def test_launchd_plist_path_uses_real_user_home_not_profile_home(self, tmp_path, monkeypatch): profile_dir = tmp_path / ".hermes" / "profiles" / "orcha" profile_dir.mkdir(parents=True)