fix(gateway): launchd_stop uses bootout so KeepAlive doesn't respawn (#7119)

launchd_stop() previously used `launchctl kill SIGTERM` which only
signals the process. Because the plist has KeepAlive.SuccessfulExit=false,
launchd immediately respawns the gateway — making `hermes gateway stop`
a no-op that prints '✓ Service stopped' while the service keeps running.

Switch to `launchctl bootout` which unloads the service definition so
KeepAlive can't trigger. The process exits and stays stopped until
`hermes gateway start` (which already handles re-bootstrapping unloaded
jobs via error codes 3/113).

Also calls the existing _wait_for_gateway_exit() after bootout to ensure
the process is fully gone before returning, and tolerates 'already
unloaded' errors from bootout.

Fixes: .env changes not taking effect after gateway stop+restart on macOS.
The root cause was that stop didn't actually stop — the respawned process
loaded the old env before the user's restart command ran.
This commit is contained in:
Teknium 2026-04-10 03:45:34 -07:00 committed by GitHub
parent 957485876b
commit 437feabb74
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 70 additions and 1 deletions

View file

@ -1196,7 +1196,19 @@ def launchd_start():
def launchd_stop():
    """Stop the launchd-managed gateway service and keep it stopped.

    Uses ``launchctl bootout`` to unload the service definition rather than
    ``launchctl kill SIGTERM``: a plain signal only terminates the process,
    and launchd immediately restarts it because the plist sets
    KeepAlive.SuccessfulExit = false. ``hermes gateway start`` re-bootstraps
    the job when it detects that it is unloaded.

    Raises:
        subprocess.CalledProcessError: if ``launchctl bootout`` exits with a
            status other than 3 or 113.
        subprocess.TimeoutExpired: if ``launchctl bootout`` does not finish
            within 90 seconds.
    """
    label = get_launchd_label()
    target = f"{_launchd_domain()}/{label}"
    try:
        subprocess.run(["launchctl", "bootout", target], check=True, timeout=90)
    except subprocess.CalledProcessError as e:
        # Exit codes 3 and 113 mean the job is already unloaded, so there is
        # nothing to stop; anything else is a real failure.
        if e.returncode not in (3, 113):
            raise
    # Ensure the process is fully gone before reporting success.
    _wait_for_gateway_exit(timeout=10.0, force_after=5.0)
    print("✓ Service stopped")
def _wait_for_gateway_exit(timeout: float = 10.0, force_after: float = 5.0):

View file

@ -234,6 +234,63 @@ class TestLaunchdServiceRecovery:
["launchctl", "kickstart", target],
]
def test_launchd_stop_uses_bootout_not_kill(self, monkeypatch):
    """Stopping issues exactly one `launchctl bootout` for the service target.

    Any other launchctl invocation (such as `kill SIGTERM`) fails the test,
    because the recorded command list must match exactly.
    """
    service_label = gateway_cli.get_launchd_label()
    service_domain = gateway_cli._launchd_domain()
    expected_argv = ["launchctl", "bootout", f"{service_domain}/{service_label}"]

    recorded = []

    def record_run(cmd, check=False, **kwargs):
        # Capture the argv instead of running launchctl; report success.
        recorded.append(cmd)
        return SimpleNamespace(returncode=0, stdout="", stderr="")

    def skip_wait(**kw):
        # The exit wait is not under test here.
        return None

    monkeypatch.setattr(gateway_cli, "_wait_for_gateway_exit", skip_wait)
    monkeypatch.setattr(gateway_cli.subprocess, "run", record_run)

    gateway_cli.launchd_stop()

    assert recorded == [expected_argv]
def test_launchd_stop_tolerates_already_unloaded(self, monkeypatch, capsys):
    """launchd_stop silently handles exit codes 3/113 (job not loaded).

    Both tolerated exit codes are exercised: for each one, bootout fails
    with that code and launchd_stop must still return normally and print
    the "stopped" confirmation.
    """
    monkeypatch.setattr(gateway_cli, "_wait_for_gateway_exit", lambda **kw: None)

    for not_loaded_code in (3, 113):

        def fake_run(cmd, check=False, **kwargs):
            # Simulate launchctl reporting that the job is not loaded.
            if "bootout" in cmd:
                raise gateway_cli.subprocess.CalledProcessError(
                    not_loaded_code, cmd, stderr="Could not find service"
                )
            return SimpleNamespace(returncode=0, stdout="", stderr="")

        monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)

        # Should not raise — these exit codes mean already unloaded.
        gateway_cli.launchd_stop()

        # readouterr() drains the capture, so each iteration sees only its
        # own output.
        output = capsys.readouterr().out
        assert "stopped" in output.lower()
def test_launchd_stop_waits_for_process_exit(self, monkeypatch):
    """After bootout, launchd_stop calls _wait_for_gateway_exit exactly once.

    The call must pass the keyword arguments timeout=10.0 and force_after=5.0.
    """
    observed_waits = []

    def succeed(cmd, check=False, **kwargs):
        # Stand-in for subprocess.run: every launchctl call succeeds.
        return SimpleNamespace(returncode=0, stdout="", stderr="")

    def capture_wait(**kwargs):
        # Record the keyword arguments of each wait invocation.
        observed_waits.append(kwargs)

    monkeypatch.setattr(gateway_cli, "_wait_for_gateway_exit", capture_wait)
    monkeypatch.setattr(gateway_cli.subprocess, "run", succeed)

    gateway_cli.launchd_stop()

    # One call, with exactly these keyword arguments.
    assert observed_waits == [{"timeout": 10.0, "force_after": 5.0}]
def test_launchd_status_reports_local_stale_plist_when_unloaded(self, tmp_path, monkeypatch, capsys):
plist_path = tmp_path / "ai.hermes.gateway.plist"
plist_path.write_text("<plist>old content</plist>", encoding="utf-8")