fix(gateway): reap cgroup orphans via ExecStopPost to unblock restart

Long-lived helpers spawned indirectly by tool calls (adb, platform
bridges) were left in the service cgroup after the gateway's main
process exited. When the kernel rejected the deferred cgroup-wide kill
with EINVAL, systemd blocked Restart=always for 6+ minutes, taking
down all platforms and cron windows (#37454).

Add a small ExecStopPost helper (gateway.cgroup_cleanup) that walks
cgroup.procs and sends per-PID SIGKILLs — a different kernel code path
than cgroup.kill, so it succeeds where the cgroup-wide write failed.
KillMode=mixed is preserved so the gateway still reaps its own
tool-call children before systemd intervenes (#8202).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
PRATHAMESH75 2026-06-02 23:14:16 +05:30 committed by Teknium
parent d7a1052424
commit e551da6ddb
4 changed files with 199 additions and 0 deletions

81
gateway/cgroup_cleanup.py Normal file
View file

@ -0,0 +1,81 @@
"""SIGKILL any process left in this systemd unit's cgroup.
Runs as ``ExecStopPost=`` so it only fires after the gateway's main process
has exited. The gateway already reaps its own tool subprocesses on a clean
shutdown; this is the safety net for long-lived helpers it doesn't track
(``adb``, platform bridges, etc.) that would otherwise be orphaned in the
cgroup and block ``Restart=always`` issue #37454.
We deliberately iterate ``cgroup.procs`` and send per-PID SIGKILLs instead
of writing ``1`` to ``cgroup.kill``: the original failure mode in #37454
was the kernel returning ``EINVAL`` on the cgroup-wide kill, while per-PID
signal delivery uses a separate code path that still works.
"""
from __future__ import annotations
import os
import re
import signal
import sys
from pathlib import Path
def _own_cgroup_path() -> str | None:
"""Return the cgroup v2 path for the calling process, or None."""
try:
text = Path("/proc/self/cgroup").read_text()
except OSError:
return None
match = re.search(r"^0::(.+)$", text, re.MULTILINE)
if not match:
return None
return match.group(1).strip()
def _read_cgroup_pids(cgroup_path: str) -> list[int]:
procs_file = Path(f"/sys/fs/cgroup{cgroup_path}/cgroup.procs")
try:
raw = procs_file.read_text()
except OSError:
return []
pids: list[int] = []
for line in raw.splitlines():
line = line.strip()
if not line:
continue
try:
pids.append(int(line))
except ValueError:
continue
return pids
def reap_cgroup(cgroup_path: str | None = None) -> int:
"""SIGKILL every PID in the cgroup other than the caller. Returns the count killed."""
if cgroup_path is None:
cgroup_path = _own_cgroup_path()
if not cgroup_path:
return 0
own = os.getpid()
killed = 0
for pid in _read_cgroup_pids(cgroup_path):
if pid == own:
continue
try:
os.kill(pid, signal.SIGKILL)
killed += 1
except ProcessLookupError:
continue
except PermissionError:
continue
return killed
def main() -> int:
reap_cgroup()
return 0
if __name__ == "__main__":
sys.exit(main())

View file

@ -2706,6 +2706,7 @@ RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}
KillMode=mixed
KillSignal=SIGTERM
ExecReload=/bin/kill -USR1 $MAINPID
ExecStopPost=-{python_path} -m gateway.cgroup_cleanup
TimeoutStopSec={restart_timeout}
StandardOutput=journal
StandardError=journal
@ -2739,6 +2740,7 @@ RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}
KillMode=mixed
KillSignal=SIGTERM
ExecReload=/bin/kill -USR1 $MAINPID
ExecStopPost=-{python_path} -m gateway.cgroup_cleanup
TimeoutStopSec={restart_timeout}
StandardOutput=journal
StandardError=journal

View file

@ -0,0 +1,100 @@
"""Tests for the systemd ExecStopPost cgroup reaper (issue #37454)."""
from __future__ import annotations
import os
import signal
from pathlib import Path
import pytest
from gateway import cgroup_cleanup
class TestOwnCgroupPath:
def test_parses_v2_cgroup_path(self, tmp_path, monkeypatch):
proc_self = tmp_path / "cgroup"
proc_self.write_text("0::/user.slice/user-1000.slice/hermes-gateway.service\n")
monkeypatch.setattr(
cgroup_cleanup,
"Path",
lambda p: proc_self if p == "/proc/self/cgroup" else Path(p),
)
assert cgroup_cleanup._own_cgroup_path() == "/user.slice/user-1000.slice/hermes-gateway.service"
def test_returns_none_when_proc_missing(self, monkeypatch):
def _raise(_path):
raise FileNotFoundError
monkeypatch.setattr(cgroup_cleanup.Path, "read_text", lambda self: _raise(self))
assert cgroup_cleanup._own_cgroup_path() is None
class TestReapCgroup:
def test_skips_own_pid_and_kills_the_rest(self, tmp_path, monkeypatch):
own = os.getpid()
cgroup_path = "/test.slice/hermes-gateway.service"
procs_file = tmp_path / "cgroup.procs"
procs_file.write_text(f"{own}\n1001\n1002\n\n")
def _fake_path(p):
if p == f"/sys/fs/cgroup{cgroup_path}/cgroup.procs":
return procs_file
return Path(p)
monkeypatch.setattr(cgroup_cleanup, "Path", _fake_path)
killed_pids: list[tuple[int, int]] = []
monkeypatch.setattr(cgroup_cleanup.os, "kill", lambda pid, sig: killed_pids.append((pid, sig)))
count = cgroup_cleanup.reap_cgroup(cgroup_path)
assert count == 2
assert (own, signal.SIGKILL) not in killed_pids
assert (1001, signal.SIGKILL) in killed_pids
assert (1002, signal.SIGKILL) in killed_pids
def test_tolerates_already_exited_pids(self, tmp_path, monkeypatch):
cgroup_path = "/test.slice/hermes-gateway.service"
procs_file = tmp_path / "cgroup.procs"
procs_file.write_text("1001\n1002\n")
monkeypatch.setattr(
cgroup_cleanup,
"Path",
lambda p: procs_file if p.endswith("cgroup.procs") else Path(p),
)
def _kill(pid, _sig):
if pid == 1001:
raise ProcessLookupError
if pid == 1002:
raise PermissionError
monkeypatch.setattr(cgroup_cleanup.os, "kill", _kill)
assert cgroup_cleanup.reap_cgroup(cgroup_path) == 0
def test_noop_when_cgroup_path_unknown(self, monkeypatch):
monkeypatch.setattr(cgroup_cleanup, "_own_cgroup_path", lambda: None)
def _explode(*_a, **_kw):
pytest.fail("os.kill must not be called when cgroup path is unknown")
monkeypatch.setattr(cgroup_cleanup.os, "kill", _explode)
assert cgroup_cleanup.reap_cgroup() == 0
def test_noop_when_procs_file_missing(self, tmp_path, monkeypatch):
cgroup_path = "/missing.slice/hermes-gateway.service"
monkeypatch.setattr(
cgroup_cleanup,
"Path",
lambda p: tmp_path / "does-not-exist" if "cgroup.procs" in p else Path(p),
)
def _explode(*_a, **_kw):
pytest.fail("os.kill must not be called when cgroup.procs is unreadable")
monkeypatch.setattr(cgroup_cleanup.os, "kill", _explode)
assert cgroup_cleanup.reap_cgroup(cgroup_path) == 0

View file

@ -431,6 +431,14 @@ class TestGeneratedSystemdUnits:
# systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
# (tool subprocess kill, adapter disconnect) runs — issue #8202.
assert self._expected_timeout_stop_sec() in unit
# ExecStopPost reaps any process the gateway didn't clean up itself,
# so long-lived helpers (e.g. adb) can't be left orphaned in the
# cgroup and block Restart=always — issue #37454.
assert "ExecStopPost=" in unit
assert "-m gateway.cgroup_cleanup" in unit
# KillMode=mixed is preserved so the gateway still reaps its own
# tool-call children before systemd SIGKILLs the cgroup — #8202.
assert "KillMode=mixed" in unit
def test_user_unit_includes_resolved_node_directory_in_path(self, monkeypatch):
monkeypatch.setattr(gateway_cli.shutil, "which", lambda cmd: "/home/test/.nvm/versions/node/v24.14.0/bin/node" if cmd == "node" else None)
@ -493,6 +501,14 @@ class TestGeneratedSystemdUnits:
# (tool subprocess kill, adapter disconnect) runs — issue #8202.
assert self._expected_timeout_stop_sec() in unit
assert "WantedBy=multi-user.target" in unit
# ExecStopPost reaps any process the gateway didn't clean up itself,
# so long-lived helpers (e.g. adb) can't be left orphaned in the
# cgroup and block Restart=always — issue #37454.
assert "ExecStopPost=" in unit
assert "-m gateway.cgroup_cleanup" in unit
# KillMode=mixed is preserved so the gateway still reaps its own
# tool-call children before systemd SIGKILLs the cgroup — #8202.
assert "KillMode=mixed" in unit
class TestGatewayStopCleanup: