diff --git a/gateway/cgroup_cleanup.py b/gateway/cgroup_cleanup.py new file mode 100644 index 00000000000..9a6225784d1 --- /dev/null +++ b/gateway/cgroup_cleanup.py @@ -0,0 +1,81 @@ +"""SIGKILL any process left in this systemd unit's cgroup. + +Runs as ``ExecStopPost=`` so it only fires after the gateway's main process +has exited. The gateway already reaps its own tool subprocesses on a clean +shutdown; this is the safety net for long-lived helpers it doesn't track +(``adb``, platform bridges, etc.) that would otherwise be orphaned in the +cgroup and block ``Restart=always`` — issue #37454. + +We deliberately iterate ``cgroup.procs`` and send per-PID SIGKILLs instead +of writing ``1`` to ``cgroup.kill``: the original failure mode in #37454 +was the kernel returning ``EINVAL`` on the cgroup-wide kill, while per-PID +signal delivery uses a separate code path that still works. +""" + +from __future__ import annotations + +import os +import re +import signal +import sys +from pathlib import Path + + +def _own_cgroup_path() -> str | None: + """Return the cgroup v2 path for the calling process, or None.""" + try: + text = Path("/proc/self/cgroup").read_text() + except OSError: + return None + match = re.search(r"^0::(.+)$", text, re.MULTILINE) + if not match: + return None + return match.group(1).strip() + + +def _read_cgroup_pids(cgroup_path: str) -> list[int]: + procs_file = Path(f"/sys/fs/cgroup{cgroup_path}/cgroup.procs") + try: + raw = procs_file.read_text() + except OSError: + return [] + pids: list[int] = [] + for line in raw.splitlines(): + line = line.strip() + if not line: + continue + try: + pids.append(int(line)) + except ValueError: + continue + return pids + + +def reap_cgroup(cgroup_path: str | None = None) -> int: + """SIGKILL every PID in the cgroup other than the caller. Returns the count killed.""" + if cgroup_path is None: + cgroup_path = _own_cgroup_path() + if not cgroup_path: + return 0 + own = os.getpid() + killed = 0 + for pid in _read_cgroup_pids(cgroup_path): + if pid == own: + continue + try: + os.kill(pid, signal.SIGKILL) + killed += 1 + except ProcessLookupError: + continue + except PermissionError: + continue + return killed + + +def main() -> int: + reap_cgroup() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 1f0735ed025..4eb360d988e 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -2706,6 +2706,7 @@ RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE} KillMode=mixed KillSignal=SIGTERM ExecReload=/bin/kill -USR1 $MAINPID +ExecStopPost=-{python_path} -m gateway.cgroup_cleanup TimeoutStopSec={restart_timeout} StandardOutput=journal StandardError=journal @@ -2739,6 +2740,7 @@ RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE} KillMode=mixed KillSignal=SIGTERM ExecReload=/bin/kill -USR1 $MAINPID +ExecStopPost=-{python_path} -m gateway.cgroup_cleanup TimeoutStopSec={restart_timeout} StandardOutput=journal StandardError=journal diff --git a/tests/gateway/test_cgroup_cleanup.py b/tests/gateway/test_cgroup_cleanup.py new file mode 100644 index 00000000000..e0395881311 --- /dev/null +++ b/tests/gateway/test_cgroup_cleanup.py @@ -0,0 +1,100 @@ +"""Tests for the systemd ExecStopPost cgroup reaper (issue #37454).""" + +from __future__ import annotations + +import os +import signal +from pathlib import Path + +import pytest + +from gateway import cgroup_cleanup + + +class TestOwnCgroupPath: + def test_parses_v2_cgroup_path(self, tmp_path, monkeypatch): + proc_self = tmp_path / "cgroup" + proc_self.write_text("0::/user.slice/user-1000.slice/hermes-gateway.service\n") + monkeypatch.setattr( + cgroup_cleanup, + "Path", + lambda p: proc_self if p == "/proc/self/cgroup" else Path(p), + ) + + assert cgroup_cleanup._own_cgroup_path() == "/user.slice/user-1000.slice/hermes-gateway.service" + + def test_returns_none_when_proc_missing(self, monkeypatch): + def _raise(_path): + raise FileNotFoundError + + monkeypatch.setattr(cgroup_cleanup.Path, "read_text", lambda self: _raise(self)) + assert cgroup_cleanup._own_cgroup_path() is None + + +class TestReapCgroup: + def test_skips_own_pid_and_kills_the_rest(self, tmp_path, monkeypatch): + own = os.getpid() + cgroup_path = "/test.slice/hermes-gateway.service" + procs_file = tmp_path / "cgroup.procs" + procs_file.write_text(f"{own}\n1001\n1002\n\n") + + def _fake_path(p): + if p == f"/sys/fs/cgroup{cgroup_path}/cgroup.procs": + return procs_file + return Path(p) + + monkeypatch.setattr(cgroup_cleanup, "Path", _fake_path) + + killed_pids: list[tuple[int, int]] = [] + monkeypatch.setattr(cgroup_cleanup.os, "kill", lambda pid, sig: killed_pids.append((pid, sig))) + + count = cgroup_cleanup.reap_cgroup(cgroup_path) + + assert count == 2 + assert (own, signal.SIGKILL) not in killed_pids + assert (1001, signal.SIGKILL) in killed_pids + assert (1002, signal.SIGKILL) in killed_pids + + def test_tolerates_already_exited_pids(self, tmp_path, monkeypatch): + cgroup_path = "/test.slice/hermes-gateway.service" + procs_file = tmp_path / "cgroup.procs" + procs_file.write_text("1001\n1002\n") + + monkeypatch.setattr( + cgroup_cleanup, + "Path", + lambda p: procs_file if p.endswith("cgroup.procs") else Path(p), + ) + + def _kill(pid, _sig): + if pid == 1001: + raise ProcessLookupError + if pid == 1002: + raise PermissionError + + monkeypatch.setattr(cgroup_cleanup.os, "kill", _kill) + + assert cgroup_cleanup.reap_cgroup(cgroup_path) == 0 + + def test_noop_when_cgroup_path_unknown(self, monkeypatch): + monkeypatch.setattr(cgroup_cleanup, "_own_cgroup_path", lambda: None) + + def _explode(*_a, **_kw): + pytest.fail("os.kill must not be called when cgroup path is unknown") + + monkeypatch.setattr(cgroup_cleanup.os, "kill", _explode) + assert cgroup_cleanup.reap_cgroup() == 0 + + def test_noop_when_procs_file_missing(self, tmp_path, monkeypatch): + cgroup_path = "/missing.slice/hermes-gateway.service" + monkeypatch.setattr( + cgroup_cleanup, + "Path", + lambda p: tmp_path / "does-not-exist" if "cgroup.procs" in p else Path(p), + ) + + def _explode(*_a, **_kw): + pytest.fail("os.kill must not be called when cgroup.procs is unreadable") + + monkeypatch.setattr(cgroup_cleanup.os, "kill", _explode) + assert cgroup_cleanup.reap_cgroup(cgroup_path) == 0 diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index a4f7bac5b3b..fa7d2e9827f 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -431,6 +431,14 @@ class TestGeneratedSystemdUnits: # systemd doesn't SIGKILL the cgroup before post-interrupt cleanup # (tool subprocess kill, adapter disconnect) runs — issue #8202. assert self._expected_timeout_stop_sec() in unit + # ExecStopPost reaps any process the gateway didn't clean up itself, + # so long-lived helpers (e.g. adb) can't be left orphaned in the + # cgroup and block Restart=always — issue #37454. + assert "ExecStopPost=" in unit + assert "-m gateway.cgroup_cleanup" in unit + # KillMode=mixed is preserved so the gateway still reaps its own + # tool-call children before systemd SIGKILLs the cgroup — #8202. + assert "KillMode=mixed" in unit def test_user_unit_includes_resolved_node_directory_in_path(self, monkeypatch): monkeypatch.setattr(gateway_cli.shutil, "which", lambda cmd: "/home/test/.nvm/versions/node/v24.14.0/bin/node" if cmd == "node" else None) @@ -493,6 +501,14 @@ class TestGeneratedSystemdUnits: # (tool subprocess kill, adapter disconnect) runs — issue #8202. assert self._expected_timeout_stop_sec() in unit assert "WantedBy=multi-user.target" in unit + # ExecStopPost reaps any process the gateway didn't clean up itself, + # so long-lived helpers (e.g. adb) can't be left orphaned in the + # cgroup and block Restart=always — issue #37454. + assert "ExecStopPost=" in unit + assert "-m gateway.cgroup_cleanup" in unit + # KillMode=mixed is preserved so the gateway still reaps its own + # tool-call children before systemd SIGKILLs the cgroup — #8202. + assert "KillMode=mixed" in unit class TestGatewayStopCleanup: