mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-18 09:51:59 +00:00
fix(gateway): defer macOS launchd reload when run inside the gateway tree (#47842)
When refresh_launchd_plist_if_needed() runs from inside the gateway's own launchd process tree (agent-initiated self-update via the terminal tool), a direct launchctl bootout tears down the service's process group — including the CLI doing the refresh — before the follow-up bootstrap can run. The gateway is left unloaded and KeepAlive can't revive it (#43842). Detect in-service execution via gateway.status.get_running_pid() + _is_pid_ancestor_of_current_process(), and delegate the bootout->bootstrap to a detached (start_new_session=True) helper that survives the process-group teardown. The normal out-of-tree CLI path is unchanged. Fixes #43842.
This commit is contained in:
parent
4d39a603d1
commit
e48803daec
2 changed files with 155 additions and 2 deletions
|
|
@ -7,6 +7,7 @@ Handles: hermes gateway [run|start|stop|restart|status|install|uninstall|setup]
|
|||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import shlex
|
||||
import shutil
|
||||
import signal
|
||||
import subprocess
|
||||
|
|
@ -3467,14 +3468,60 @@ def refresh_launchd_plist_if_needed() -> bool:
|
|||
|
||||
plist_path.write_text(new_plist, encoding="utf-8")
|
||||
label = get_launchd_label()
|
||||
domain = _launchd_domain()
|
||||
target = f"{domain}/{label}"
|
||||
|
||||
# If this refresh is running INSIDE the gateway's own launchd process tree
|
||||
# (e.g. the agent triggered a self-update via its terminal tool), a direct
|
||||
# `launchctl bootout` tears down the service's process group — which
|
||||
# includes THIS CLI — before the follow-up `bootstrap` can run. The gateway
|
||||
# then stays unloaded and KeepAlive can't revive it (#43842). Detect that
|
||||
# case and hand the reload to a detached session that survives the bootout.
|
||||
gateway_pid = None
|
||||
try:
|
||||
from gateway.status import get_running_pid
|
||||
gateway_pid = get_running_pid()
|
||||
except Exception:
|
||||
gateway_pid = None
|
||||
|
||||
if (
|
||||
gateway_pid is not None
|
||||
and _is_pid_ancestor_of_current_process(gateway_pid)
|
||||
and hasattr(os, "setsid") # POSIX-only; launchd is macOS so always true here
|
||||
):
|
||||
# Delegate to a new session: `start_new_session=True` detaches the
|
||||
# helper from the gateway's process group, so the bootout that kills
|
||||
# the gateway (and us) does not kill the helper before it bootstraps.
|
||||
reload_script = (
|
||||
f"sleep 2; "
|
||||
f"launchctl bootout {shlex.quote(target)} 2>/dev/null; "
|
||||
f"sleep 1; "
|
||||
f"launchctl bootstrap {shlex.quote(domain)} {shlex.quote(str(plist_path))} 2>/dev/null"
|
||||
)
|
||||
try:
|
||||
subprocess.Popen(
|
||||
["/bin/bash", "-c", reload_script],
|
||||
start_new_session=True,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("Deferred launchd reload could not be spawned: %s", e)
|
||||
return False
|
||||
print(
|
||||
"↻ Updated gateway launchd service definition; reload deferred to a "
|
||||
"detached helper (refresh ran inside the gateway process tree)"
|
||||
)
|
||||
return True
|
||||
|
||||
# Bootout/bootstrap so launchd picks up the new definition
|
||||
subprocess.run(
|
||||
["launchctl", "bootout", f"{_launchd_domain()}/{label}"],
|
||||
["launchctl", "bootout", target],
|
||||
check=False,
|
||||
timeout=90,
|
||||
)
|
||||
subprocess.run(
|
||||
["launchctl", "bootstrap", _launchd_domain(), str(plist_path)],
|
||||
["launchctl", "bootstrap", domain, str(plist_path)],
|
||||
check=False,
|
||||
timeout=30,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -599,6 +599,8 @@ class TestLaunchdServiceRecovery:
|
|||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
# Not running inside the gateway tree → direct bootout/bootstrap path.
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: None)
|
||||
|
||||
gateway_cli.launchd_install()
|
||||
|
||||
|
|
@ -613,6 +615,110 @@ class TestLaunchdServiceRecovery:
|
|||
["launchctl", "bootstrap", domain, str(plist_path)],
|
||||
]
|
||||
|
||||
def test_refresh_defers_reload_when_running_inside_gateway_tree(self, tmp_path, monkeypatch):
|
||||
"""#43842: when the refresh runs inside the gateway's own process tree,
|
||||
a direct bootout would kill this CLI before bootstrap. The reload must
|
||||
be delegated to a detached helper instead."""
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
plist_path.write_text("<plist>old content</plist>", encoding="utf-8")
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||
monkeypatch.setattr(gateway_cli, "launchd_plist_is_current", lambda: False)
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"generate_launchd_plist",
|
||||
lambda: (
|
||||
"<plist>--replace\n<key>HERMES_HOME</key>"
|
||||
"<string>/Users/alice/.hermes</string></plist>"
|
||||
),
|
||||
)
|
||||
# Pretend the gateway is running and that we ARE inside its tree.
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: 4242)
|
||||
monkeypatch.setattr(
|
||||
gateway_cli, "_is_pid_ancestor_of_current_process", lambda pid: pid == 4242
|
||||
)
|
||||
|
||||
run_calls = []
|
||||
|
||||
def fake_run(cmd, check=False, **kwargs):
|
||||
run_calls.append(cmd)
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
popen_calls = []
|
||||
|
||||
def fake_popen(cmd, **kwargs):
|
||||
popen_calls.append((cmd, kwargs))
|
||||
return SimpleNamespace(pid=9999)
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "Popen", fake_popen)
|
||||
|
||||
result = gateway_cli.refresh_launchd_plist_if_needed()
|
||||
|
||||
assert result is True
|
||||
# The new plist was written.
|
||||
assert "--replace" in plist_path.read_text(encoding="utf-8")
|
||||
# No DIRECT bootout/bootstrap ran (those would kill us mid-sequence).
|
||||
assert not [c for c in run_calls if "bootout" in c or "bootstrap" in c]
|
||||
# Exactly one detached helper was spawned, in a new session, and it
|
||||
# performs both bootout and bootstrap.
|
||||
assert len(popen_calls) == 1
|
||||
cmd, kwargs = popen_calls[0]
|
||||
assert kwargs.get("start_new_session") is True
|
||||
script = cmd[-1]
|
||||
assert "bootout" in script and "bootstrap" in script
|
||||
assert str(plist_path) in script
|
||||
|
||||
def test_refresh_uses_direct_reload_when_not_inside_gateway_tree(self, tmp_path, monkeypatch):
|
||||
"""Normal CLI-initiated refresh (outside the service tree) keeps the
|
||||
direct synchronous bootout/bootstrap path."""
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
plist_path.write_text("<plist>old content</plist>", encoding="utf-8")
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||
monkeypatch.setattr(gateway_cli, "launchd_plist_is_current", lambda: False)
|
||||
monkeypatch.setattr(
|
||||
gateway_cli,
|
||||
"generate_launchd_plist",
|
||||
lambda: (
|
||||
"<plist>--replace\n<key>HERMES_HOME</key>"
|
||||
"<string>/Users/alice/.hermes</string></plist>"
|
||||
),
|
||||
)
|
||||
# Gateway running, but we are NOT inside its tree.
|
||||
monkeypatch.setattr("gateway.status.get_running_pid", lambda *a, **k: 4242)
|
||||
monkeypatch.setattr(
|
||||
gateway_cli, "_is_pid_ancestor_of_current_process", lambda pid: False
|
||||
)
|
||||
|
||||
run_calls = []
|
||||
|
||||
def fake_run(cmd, check=False, **kwargs):
|
||||
run_calls.append(cmd)
|
||||
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
popen_calls = []
|
||||
monkeypatch.setattr(
|
||||
gateway_cli.subprocess, "Popen",
|
||||
lambda cmd, **kw: popen_calls.append(cmd) or SimpleNamespace(pid=1),
|
||||
)
|
||||
|
||||
result = gateway_cli.refresh_launchd_plist_if_needed()
|
||||
|
||||
assert result is True
|
||||
# No detached helper — direct path taken.
|
||||
assert not popen_calls
|
||||
label = gateway_cli.get_launchd_label()
|
||||
domain = gateway_cli._launchd_domain()
|
||||
service_calls = [c for c in run_calls if "bootout" in c or "bootstrap" in c]
|
||||
assert service_calls[:2] == [
|
||||
["launchctl", "bootout", f"{domain}/{label}"],
|
||||
["launchctl", "bootstrap", domain, str(plist_path)],
|
||||
]
|
||||
|
||||
def test_launchd_start_reloads_unloaded_job_and_retries(self, tmp_path, monkeypatch):
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
plist_path.write_text(gateway_cli.generate_launchd_plist(), encoding="utf-8")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue