mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 01:21:43 +00:00
fix: hermes update kills freshly-restarted gateway service
After restarting a service-managed gateway (systemd/launchd), the stale-process sweep calls find_gateway_pids() which returns ALL gateway PIDs via ps aux — including the one just spawned by the service manager. The sweep kills it, leaving the user with a stopped gateway and a confusing 'Restart manually' message. Fix: add _get_service_pids() to query systemd MainPID and launchd PID for active gateway services, then exclude those PIDs from the sweep. Also add exclude_pids parameter to find_gateway_pids() and kill_gateway_processes() so callers can skip known service-managed PIDs. Adds 9 targeted tests covering: - _get_service_pids() for systemd, launchd, empty, and zero-PID cases - find_gateway_pids() exclude_pids filtering - cmd_update integration: service PID not killed after restart - cmd_update integration: manual PID killed while service PID preserved
This commit is contained in:
parent
9c96f669a1
commit
a2a9ad7431
3 changed files with 371 additions and 9 deletions
|
|
@ -491,3 +491,263 @@ class TestCmdUpdateSystemService:
|
|||
captured = capsys.readouterr().out
|
||||
# Both scopes are discovered and restarted
|
||||
assert "Restarted hermes-gateway" in captured
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Service PID exclusion — the core bug fix
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestServicePidExclusion:
|
||||
"""After restarting a service, the stale-process sweep must NOT kill
|
||||
the freshly-spawned service PID. This was the root cause of the bug
|
||||
where ``hermes update`` would restart the gateway and immediately kill it.
|
||||
"""
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_update_launchd_does_not_kill_service_pid(
|
||||
self, mock_run, _mock_which, mock_args, capsys, monkeypatch, tmp_path,
|
||||
):
|
||||
"""After launchd restart, the sweep must exclude the service PID."""
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
plist_path.write_text("<plist/>")
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||
|
||||
# The service PID that launchd manages after restart
|
||||
SERVICE_PID = 42000
|
||||
|
||||
mock_run.side_effect = _make_run_side_effect(
|
||||
commit_count="3",
|
||||
launchctl_loaded=True,
|
||||
)
|
||||
|
||||
# Simulate find_gateway_pids returning the service PID (the bug scenario)
|
||||
# and _get_service_pids returning the same PID to exclude it
|
||||
with patch.object(
|
||||
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
|
||||
), patch.object(
|
||||
gateway_cli, "find_gateway_pids",
|
||||
side_effect=lambda exclude_pids=None: (
|
||||
[SERVICE_PID] if not exclude_pids else
|
||||
[p for p in [SERVICE_PID] if p not in exclude_pids]
|
||||
),
|
||||
), patch("os.kill") as mock_kill:
|
||||
cmd_update(mock_args)
|
||||
|
||||
captured = capsys.readouterr().out
|
||||
# Service was restarted
|
||||
assert "Restarted" in captured
|
||||
# The service PID should NOT have been killed by the manual sweep
|
||||
kill_calls = [
|
||||
c for c in mock_kill.call_args_list
|
||||
if c.args[0] == SERVICE_PID
|
||||
]
|
||||
assert len(kill_calls) == 0, (
|
||||
f"Service PID {SERVICE_PID} was killed by the manual sweep — "
|
||||
f"this is the bug where update restarts then immediately kills the gateway"
|
||||
)
|
||||
# Should NOT show manual restart message
|
||||
assert "Restart manually" not in captured
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_update_systemd_does_not_kill_service_pid(
|
||||
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
|
||||
):
|
||||
"""After systemd restart, the sweep must exclude the service PID."""
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
|
||||
|
||||
SERVICE_PID = 55000
|
||||
|
||||
mock_run.side_effect = _make_run_side_effect(
|
||||
commit_count="3",
|
||||
systemd_active=True,
|
||||
)
|
||||
|
||||
with patch.object(
|
||||
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
|
||||
), patch.object(
|
||||
gateway_cli, "find_gateway_pids",
|
||||
side_effect=lambda exclude_pids=None: (
|
||||
[SERVICE_PID] if not exclude_pids else
|
||||
[p for p in [SERVICE_PID] if p not in exclude_pids]
|
||||
),
|
||||
), patch("os.kill") as mock_kill:
|
||||
cmd_update(mock_args)
|
||||
|
||||
captured = capsys.readouterr().out
|
||||
assert "Restarted hermes-gateway" in captured
|
||||
# Service PID must not be killed
|
||||
kill_calls = [
|
||||
c for c in mock_kill.call_args_list
|
||||
if c.args[0] == SERVICE_PID
|
||||
]
|
||||
assert len(kill_calls) == 0
|
||||
assert "Restart manually" not in captured
|
||||
|
||||
@patch("shutil.which", return_value=None)
|
||||
@patch("subprocess.run")
|
||||
def test_update_kills_manual_pid_but_not_service_pid(
|
||||
self, mock_run, _mock_which, mock_args, capsys, monkeypatch, tmp_path,
|
||||
):
|
||||
"""When both a service PID and a manual PID exist, only the manual one
|
||||
is killed."""
|
||||
plist_path = tmp_path / "ai.hermes.gateway.plist"
|
||||
plist_path.write_text("<plist/>")
|
||||
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
|
||||
|
||||
SERVICE_PID = 42000
|
||||
MANUAL_PID = 42999
|
||||
|
||||
mock_run.side_effect = _make_run_side_effect(
|
||||
commit_count="3",
|
||||
launchctl_loaded=True,
|
||||
)
|
||||
|
||||
def fake_find(exclude_pids=None):
|
||||
_exclude = exclude_pids or set()
|
||||
return [p for p in [SERVICE_PID, MANUAL_PID] if p not in _exclude]
|
||||
|
||||
with patch.object(
|
||||
gateway_cli, "_get_service_pids", return_value={SERVICE_PID}
|
||||
), patch.object(
|
||||
gateway_cli, "find_gateway_pids", side_effect=fake_find,
|
||||
), patch("os.kill") as mock_kill:
|
||||
cmd_update(mock_args)
|
||||
|
||||
captured = capsys.readouterr().out
|
||||
assert "Restarted" in captured
|
||||
# Manual PID should be killed
|
||||
manual_kills = [c for c in mock_kill.call_args_list if c.args[0] == MANUAL_PID]
|
||||
assert len(manual_kills) == 1
|
||||
# Service PID should NOT be killed
|
||||
service_kills = [c for c in mock_kill.call_args_list if c.args[0] == SERVICE_PID]
|
||||
assert len(service_kills) == 0
|
||||
# Should show manual stop message since manual PID was killed
|
||||
assert "Stopped 1 manual gateway" in captured
|
||||
|
||||
|
||||
class TestGetServicePids:
|
||||
"""Unit tests for _get_service_pids()."""
|
||||
|
||||
def test_returns_systemd_main_pid(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
joined = " ".join(str(c) for c in cmd)
|
||||
if "list-units" in joined:
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout="hermes-gateway.service loaded active running Hermes Gateway\n",
|
||||
stderr="",
|
||||
)
|
||||
if "show" in joined and "MainPID" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="12345\n", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
pids = gateway_cli._get_service_pids()
|
||||
assert 12345 in pids
|
||||
|
||||
def test_returns_launchd_pid(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: True)
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
joined = " ".join(str(c) for c in cmd)
|
||||
if "launchctl" in joined and "list" in joined:
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout="PID\tStatus\tLabel\n67890\t0\tai.hermes.gateway\n",
|
||||
stderr="",
|
||||
)
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
pids = gateway_cli._get_service_pids()
|
||||
assert 67890 in pids
|
||||
|
||||
def test_returns_empty_when_no_services(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: False)
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
|
||||
pids = gateway_cli._get_service_pids()
|
||||
assert pids == set()
|
||||
|
||||
def test_excludes_zero_pid(self, monkeypatch):
|
||||
"""systemd returns MainPID=0 for stopped services; skip those."""
|
||||
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
|
||||
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
joined = " ".join(str(c) for c in cmd)
|
||||
if "list-units" in joined:
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout="hermes-gateway.service loaded inactive dead Hermes Gateway\n",
|
||||
stderr="",
|
||||
)
|
||||
if "show" in joined and "MainPID" in joined:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="0\n", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
|
||||
pids = gateway_cli._get_service_pids()
|
||||
assert 0 not in pids
|
||||
assert pids == set()
|
||||
|
||||
|
||||
class TestFindGatewayPidsExclude:
|
||||
"""find_gateway_pids respects exclude_pids."""
|
||||
|
||||
def test_excludes_specified_pids(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "is_windows", lambda: False)
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout=(
|
||||
"user 100 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
|
||||
"user 200 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
|
||||
),
|
||||
stderr="",
|
||||
)
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
monkeypatch.setattr("os.getpid", lambda: 999)
|
||||
|
||||
pids = gateway_cli.find_gateway_pids(exclude_pids={100})
|
||||
assert 100 not in pids
|
||||
assert 200 in pids
|
||||
|
||||
def test_no_exclude_returns_all(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli, "is_windows", lambda: False)
|
||||
|
||||
def fake_run(cmd, **kwargs):
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout=(
|
||||
"user 100 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
|
||||
"user 200 0.0 0.0 0 0 ? S 00:00 0:00 python gateway/run.py\n"
|
||||
),
|
||||
stderr="",
|
||||
)
|
||||
|
||||
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||
monkeypatch.setattr("os.getpid", lambda: 999)
|
||||
|
||||
pids = gateway_cli.find_gateway_pids()
|
||||
assert 100 in pids
|
||||
assert 200 in pids
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue