diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index d5b76574a12..7e5406a11dd 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -7,6 +7,7 @@ Handles: hermes gateway [run|start|stop|restart|status|install|uninstall|setup] import asyncio import logging import os +import shlex import shutil import signal import subprocess @@ -3467,14 +3468,60 @@ def refresh_launchd_plist_if_needed() -> bool: plist_path.write_text(new_plist, encoding="utf-8") label = get_launchd_label() + domain = _launchd_domain() + target = f"{domain}/{label}" + + # If this refresh is running INSIDE the gateway's own launchd process tree + # (e.g. the agent triggered a self-update via its terminal tool), a direct + # `launchctl bootout` tears down the service's process group — which + # includes THIS CLI — before the follow-up `bootstrap` can run. The gateway + # then stays unloaded and KeepAlive can't revive it (#43842). Detect that + # case and hand the reload to a detached session that survives the bootout. + gateway_pid = None + try: + from gateway.status import get_running_pid + gateway_pid = get_running_pid() + except Exception: + gateway_pid = None + + if ( + gateway_pid is not None + and _is_pid_ancestor_of_current_process(gateway_pid) + and hasattr(os, "setsid") # POSIX-only; launchd is macOS so always true here + ): + # Delegate to a new session: `start_new_session=True` detaches the + # helper from the gateway's process group, so the bootout that kills + # the gateway (and us) does not kill the helper before it bootstraps. + reload_script = ( + f"sleep 2; " + f"launchctl bootout {shlex.quote(target)} 2>/dev/null; " + f"sleep 1; " + f"launchctl bootstrap {shlex.quote(domain)} {shlex.quote(str(plist_path))} 2>/dev/null" + ) + try: + subprocess.Popen( + ["/bin/bash", "-c", reload_script], + start_new_session=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + except Exception as e: + logger.warning("Deferred launchd reload could not be spawned: %s", e) + return False + print( + "↻ Updated gateway launchd service definition; reload deferred to a " + "detached helper (refresh ran inside the gateway process tree)" + ) + return True + # Bootout/bootstrap so launchd picks up the new definition subprocess.run( - ["launchctl", "bootout", f"{_launchd_domain()}/{label}"], + ["launchctl", "bootout", target], check=False, timeout=90, ) subprocess.run( - ["launchctl", "bootstrap", _launchd_domain(), str(plist_path)], + ["launchctl", "bootstrap", domain, str(plist_path)], check=False, timeout=30, ) diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index f9cdcc1f313..6dd504a5f70 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -599,6 +599,8 @@ class TestLaunchdServiceRecovery: return SimpleNamespace(returncode=0, stdout="", stderr="") monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + # Not running inside the gateway tree → direct bootout/bootstrap path. + monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: None) gateway_cli.launchd_install() @@ -613,6 +615,110 @@ class TestLaunchdServiceRecovery: ["launchctl", "bootstrap", domain, str(plist_path)], ] + def test_refresh_defers_reload_when_running_inside_gateway_tree(self, tmp_path, monkeypatch): + """#43842: when the refresh runs inside the gateway's own process tree, + a direct bootout would kill this CLI before bootstrap. The reload must + be delegated to a detached helper instead.""" + plist_path = tmp_path / "ai.hermes.gateway.plist" + plist_path.write_text("old content", encoding="utf-8") + + monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path) + monkeypatch.setattr(gateway_cli, "launchd_plist_is_current", lambda: False) + monkeypatch.setattr( + gateway_cli, + "generate_launchd_plist", + lambda: ( + "--replace\nHERMES_HOME" + "/Users/alice/.hermes" + ), + ) + # Pretend the gateway is running and that we ARE inside its tree. + monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: 4242) + monkeypatch.setattr( + gateway_cli, "_is_pid_ancestor_of_current_process", lambda pid: pid == 4242 + ) + + run_calls = [] + + def fake_run(cmd, check=False, **kwargs): + run_calls.append(cmd) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + + popen_calls = [] + + def fake_popen(cmd, **kwargs): + popen_calls.append((cmd, kwargs)) + return SimpleNamespace(pid=9999) + + monkeypatch.setattr(gateway_cli.subprocess, "Popen", fake_popen) + + result = gateway_cli.refresh_launchd_plist_if_needed() + + assert result is True + # The new plist was written. + assert "--replace" in plist_path.read_text(encoding="utf-8") + # No DIRECT bootout/bootstrap ran (those would kill us mid-sequence). + assert not [c for c in run_calls if "bootout" in c or "bootstrap" in c] + # Exactly one detached helper was spawned, in a new session, and it + # performs both bootout and bootstrap. + assert len(popen_calls) == 1 + cmd, kwargs = popen_calls[0] + assert kwargs.get("start_new_session") is True + script = cmd[-1] + assert "bootout" in script and "bootstrap" in script + assert str(plist_path) in script + + def test_refresh_uses_direct_reload_when_not_inside_gateway_tree(self, tmp_path, monkeypatch): + """Normal CLI-initiated refresh (outside the service tree) keeps the + direct synchronous bootout/bootstrap path.""" + plist_path = tmp_path / "ai.hermes.gateway.plist" + plist_path.write_text("old content", encoding="utf-8") + + monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path) + monkeypatch.setattr(gateway_cli, "launchd_plist_is_current", lambda: False) + monkeypatch.setattr( + gateway_cli, + "generate_launchd_plist", + lambda: ( + "--replace\nHERMES_HOME" + "/Users/alice/.hermes" + ), + ) + # Gateway running, but we are NOT inside its tree. + monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: 4242) + monkeypatch.setattr( + gateway_cli, "_is_pid_ancestor_of_current_process", lambda pid: False + ) + + run_calls = [] + + def fake_run(cmd, check=False, **kwargs): + run_calls.append(cmd) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run) + + popen_calls = [] + monkeypatch.setattr( + gateway_cli.subprocess, "Popen", + lambda cmd, **kw: popen_calls.append(cmd) or SimpleNamespace(pid=1), + ) + + result = gateway_cli.refresh_launchd_plist_if_needed() + + assert result is True + # No detached helper — direct path taken. + assert not popen_calls + label = gateway_cli.get_launchd_label() + domain = gateway_cli._launchd_domain() + service_calls = [c for c in run_calls if "bootout" in c or "bootstrap" in c] + assert service_calls[:2] == [ + ["launchctl", "bootout", f"{domain}/{label}"], + ["launchctl", "bootstrap", domain, str(plist_path)], + ] + def test_launchd_start_reloads_unloaded_job_and_retries(self, tmp_path, monkeypatch): plist_path = tmp_path / "ai.hermes.gateway.plist" plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")