fix(gateway): probe launchd domain instead of hardcoding user/<uid> (#40831)

The previous fix for #23387 changed _launchd_domain() from gui/<uid> to
user/<uid> to support Background/SSH sessions on macOS 26+. However, this
broke Aqua sessions where gui/<uid> is the only working domain and
user/<uid> cannot bootstrap or manage the service.

Now _launchd_domain() probes which domain actually contains the loaded
service:
1. Try gui/<uid> first (Aqua sessions)
2. Fall back to user/<uid> (Background/SSH sessions)
3. Use launchctl managername as heuristic when neither has the service
4. Cache the result for the process lifetime

Regression tests cover all four paths plus caching behavior.
This commit is contained in:
Tranquil-Flow 2026-06-07 02:09:04 +02:00 committed by Teknium
parent 2d75833abe
commit a8f404b29f
2 changed files with 196 additions and 8 deletions

View file

@ -3067,12 +3067,77 @@ def get_launchd_label() -> str:
return f"ai.hermes.gateway-{suffix}" if suffix else "ai.hermes.gateway"
# Cached launchd domain result — probing is cheap but should only run once per
# process invocation (each ``hermes gateway start/stop/status`` call).
_resolved_launchd_domain: str | None = None
def _launchd_domain() -> str:
# The `user/<uid>` domain (vs the older `gui/<uid>`) is reachable from
# non-Aqua/background sessions (SSH, headless, login items) and is the only
# one that supports service management on macOS 26+. `gui/<uid>` returns
# error 125 ("Domain does not support specified action") there. See #23387.
return f"user/{os.getuid()}" # windows-footgun: ok — POSIX launchd (macOS) helper, never invoked on Windows
"""Return the launchd domain that actually manages the gateway service.
Probes ``gui/<uid>`` first (Aqua sessions), then ``user/<uid>``
(Background/SSH sessions). When neither domain contains a loaded
service, falls back to ``launchctl managername`` as a heuristic.
The result is cached for the lifetime of the process so that repeated
calls (``start``, ``stop``, ``restart``) use a consistent domain.
See #40831, #23387.
"""
global _resolved_launchd_domain
if _resolved_launchd_domain is not None:
return _resolved_launchd_domain
uid = os.getuid() # windows-footgun: ok — POSIX launchd (macOS) helper, never invoked on Windows
label = get_launchd_label()
gui_domain = f"gui/{uid}"
user_domain = f"user/{uid}"
# 1. Probe gui/<uid> first — in Aqua sessions the service is loaded here.
try:
subprocess.run(
["launchctl", "print", f"{gui_domain}/{label}"],
check=True,
timeout=5,
capture_output=True,
)
_resolved_launchd_domain = gui_domain
return gui_domain
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
pass
# 2. Probe user/<uid> — in Background/SSH sessions this is the working domain.
try:
subprocess.run(
["launchctl", "print", f"{user_domain}/{label}"],
check=True,
timeout=5,
capture_output=True,
)
_resolved_launchd_domain = user_domain
return user_domain
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
pass
# 3. Neither domain has the service loaded — use managername as heuristic.
# Aqua → gui/<uid>, anything else (Background, loginwindow) → user/<uid>.
try:
result = subprocess.run(
["launchctl", "managername"],
capture_output=True,
text=True,
timeout=5,
)
if "Aqua" in (result.stdout or ""):
_resolved_launchd_domain = gui_domain
return gui_domain
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
pass
# 4. Default to user/<uid> (matches the pre-probing behavior for
# Background/SSH sessions and is the recommended domain on macOS 26+).
_resolved_launchd_domain = user_domain
return user_domain
# On macOS, exit code 125 ("Domain does not support specified action") and