mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(gateway): reap cgroup orphans via ExecStopPost to unblock restart
Long-lived helpers spawned indirectly by tool calls (adb, platform bridges) were left in the service cgroup after the gateway's main process exited. When the kernel rejected the deferred cgroup-wide kill with EINVAL, systemd blocked Restart=always for 6+ minutes, taking down all platforms and cron windows (#37454). Add a small ExecStopPost helper (gateway.cgroup_cleanup) that walks cgroup.procs and sends per-PID SIGKILLs — a different kernel code path than cgroup.kill, so it succeeds where the cgroup-wide write failed. KillMode=mixed is preserved so the gateway still reaps its own tool-call children before systemd intervenes (#8202). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
d7a1052424
commit
e551da6ddb
4 changed files with 199 additions and 0 deletions
81
gateway/cgroup_cleanup.py
Normal file
81
gateway/cgroup_cleanup.py
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
"""SIGKILL any process left in this systemd unit's cgroup.
|
||||
|
||||
Runs as ``ExecStopPost=`` so it only fires after the gateway's main process
|
||||
has exited. The gateway already reaps its own tool subprocesses on a clean
|
||||
shutdown; this is the safety net for long-lived helpers it doesn't track
|
||||
(``adb``, platform bridges, etc.) that would otherwise be orphaned in the
|
||||
cgroup and block ``Restart=always`` — issue #37454.
|
||||
|
||||
We deliberately iterate ``cgroup.procs`` and send per-PID SIGKILLs instead
|
||||
of writing ``1`` to ``cgroup.kill``: the original failure mode in #37454
|
||||
was the kernel returning ``EINVAL`` on the cgroup-wide kill, while per-PID
|
||||
signal delivery uses a separate code path that still works.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _own_cgroup_path() -> str | None:
|
||||
"""Return the cgroup v2 path for the calling process, or None."""
|
||||
try:
|
||||
text = Path("/proc/self/cgroup").read_text()
|
||||
except OSError:
|
||||
return None
|
||||
match = re.search(r"^0::(.+)$", text, re.MULTILINE)
|
||||
if not match:
|
||||
return None
|
||||
return match.group(1).strip()
|
||||
|
||||
|
||||
def _read_cgroup_pids(cgroup_path: str) -> list[int]:
|
||||
procs_file = Path(f"/sys/fs/cgroup{cgroup_path}/cgroup.procs")
|
||||
try:
|
||||
raw = procs_file.read_text()
|
||||
except OSError:
|
||||
return []
|
||||
pids: list[int] = []
|
||||
for line in raw.splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
pids.append(int(line))
|
||||
except ValueError:
|
||||
continue
|
||||
return pids
|
||||
|
||||
|
||||
def reap_cgroup(cgroup_path: str | None = None) -> int:
|
||||
"""SIGKILL every PID in the cgroup other than the caller. Returns the count killed."""
|
||||
if cgroup_path is None:
|
||||
cgroup_path = _own_cgroup_path()
|
||||
if not cgroup_path:
|
||||
return 0
|
||||
own = os.getpid()
|
||||
killed = 0
|
||||
for pid in _read_cgroup_pids(cgroup_path):
|
||||
if pid == own:
|
||||
continue
|
||||
try:
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
killed += 1
|
||||
except ProcessLookupError:
|
||||
continue
|
||||
except PermissionError:
|
||||
continue
|
||||
return killed
|
||||
|
||||
|
||||
def main() -> int:
|
||||
reap_cgroup()
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
|
@ -2706,6 +2706,7 @@ RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}
|
|||
KillMode=mixed
|
||||
KillSignal=SIGTERM
|
||||
ExecReload=/bin/kill -USR1 $MAINPID
|
||||
ExecStopPost=-{python_path} -m gateway.cgroup_cleanup
|
||||
TimeoutStopSec={restart_timeout}
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
|
@ -2739,6 +2740,7 @@ RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}
|
|||
KillMode=mixed
|
||||
KillSignal=SIGTERM
|
||||
ExecReload=/bin/kill -USR1 $MAINPID
|
||||
ExecStopPost=-{python_path} -m gateway.cgroup_cleanup
|
||||
TimeoutStopSec={restart_timeout}
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
|
|
|||
100
tests/gateway/test_cgroup_cleanup.py
Normal file
100
tests/gateway/test_cgroup_cleanup.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
"""Tests for the systemd ExecStopPost cgroup reaper (issue #37454)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import signal
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from gateway import cgroup_cleanup
|
||||
|
||||
|
||||
class TestOwnCgroupPath:
|
||||
def test_parses_v2_cgroup_path(self, tmp_path, monkeypatch):
|
||||
proc_self = tmp_path / "cgroup"
|
||||
proc_self.write_text("0::/user.slice/user-1000.slice/hermes-gateway.service\n")
|
||||
monkeypatch.setattr(
|
||||
cgroup_cleanup,
|
||||
"Path",
|
||||
lambda p: proc_self if p == "/proc/self/cgroup" else Path(p),
|
||||
)
|
||||
|
||||
assert cgroup_cleanup._own_cgroup_path() == "/user.slice/user-1000.slice/hermes-gateway.service"
|
||||
|
||||
def test_returns_none_when_proc_missing(self, monkeypatch):
|
||||
def _raise(_path):
|
||||
raise FileNotFoundError
|
||||
|
||||
monkeypatch.setattr(cgroup_cleanup.Path, "read_text", lambda self: _raise(self))
|
||||
assert cgroup_cleanup._own_cgroup_path() is None
|
||||
|
||||
|
||||
class TestReapCgroup:
|
||||
def test_skips_own_pid_and_kills_the_rest(self, tmp_path, monkeypatch):
|
||||
own = os.getpid()
|
||||
cgroup_path = "/test.slice/hermes-gateway.service"
|
||||
procs_file = tmp_path / "cgroup.procs"
|
||||
procs_file.write_text(f"{own}\n1001\n1002\n\n")
|
||||
|
||||
def _fake_path(p):
|
||||
if p == f"/sys/fs/cgroup{cgroup_path}/cgroup.procs":
|
||||
return procs_file
|
||||
return Path(p)
|
||||
|
||||
monkeypatch.setattr(cgroup_cleanup, "Path", _fake_path)
|
||||
|
||||
killed_pids: list[tuple[int, int]] = []
|
||||
monkeypatch.setattr(cgroup_cleanup.os, "kill", lambda pid, sig: killed_pids.append((pid, sig)))
|
||||
|
||||
count = cgroup_cleanup.reap_cgroup(cgroup_path)
|
||||
|
||||
assert count == 2
|
||||
assert (own, signal.SIGKILL) not in killed_pids
|
||||
assert (1001, signal.SIGKILL) in killed_pids
|
||||
assert (1002, signal.SIGKILL) in killed_pids
|
||||
|
||||
def test_tolerates_already_exited_pids(self, tmp_path, monkeypatch):
|
||||
cgroup_path = "/test.slice/hermes-gateway.service"
|
||||
procs_file = tmp_path / "cgroup.procs"
|
||||
procs_file.write_text("1001\n1002\n")
|
||||
|
||||
monkeypatch.setattr(
|
||||
cgroup_cleanup,
|
||||
"Path",
|
||||
lambda p: procs_file if p.endswith("cgroup.procs") else Path(p),
|
||||
)
|
||||
|
||||
def _kill(pid, _sig):
|
||||
if pid == 1001:
|
||||
raise ProcessLookupError
|
||||
if pid == 1002:
|
||||
raise PermissionError
|
||||
|
||||
monkeypatch.setattr(cgroup_cleanup.os, "kill", _kill)
|
||||
|
||||
assert cgroup_cleanup.reap_cgroup(cgroup_path) == 0
|
||||
|
||||
def test_noop_when_cgroup_path_unknown(self, monkeypatch):
|
||||
monkeypatch.setattr(cgroup_cleanup, "_own_cgroup_path", lambda: None)
|
||||
|
||||
def _explode(*_a, **_kw):
|
||||
pytest.fail("os.kill must not be called when cgroup path is unknown")
|
||||
|
||||
monkeypatch.setattr(cgroup_cleanup.os, "kill", _explode)
|
||||
assert cgroup_cleanup.reap_cgroup() == 0
|
||||
|
||||
def test_noop_when_procs_file_missing(self, tmp_path, monkeypatch):
|
||||
cgroup_path = "/missing.slice/hermes-gateway.service"
|
||||
monkeypatch.setattr(
|
||||
cgroup_cleanup,
|
||||
"Path",
|
||||
lambda p: tmp_path / "does-not-exist" if "cgroup.procs" in p else Path(p),
|
||||
)
|
||||
|
||||
def _explode(*_a, **_kw):
|
||||
pytest.fail("os.kill must not be called when cgroup.procs is unreadable")
|
||||
|
||||
monkeypatch.setattr(cgroup_cleanup.os, "kill", _explode)
|
||||
assert cgroup_cleanup.reap_cgroup(cgroup_path) == 0
|
||||
|
|
@ -431,6 +431,14 @@ class TestGeneratedSystemdUnits:
|
|||
# systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
|
||||
# (tool subprocess kill, adapter disconnect) runs — issue #8202.
|
||||
assert self._expected_timeout_stop_sec() in unit
|
||||
# ExecStopPost reaps any process the gateway didn't clean up itself,
|
||||
# so long-lived helpers (e.g. adb) can't be left orphaned in the
|
||||
# cgroup and block Restart=always — issue #37454.
|
||||
assert "ExecStopPost=" in unit
|
||||
assert "-m gateway.cgroup_cleanup" in unit
|
||||
# KillMode=mixed is preserved so the gateway still reaps its own
|
||||
# tool-call children before systemd SIGKILLs the cgroup — #8202.
|
||||
assert "KillMode=mixed" in unit
|
||||
|
||||
def test_user_unit_includes_resolved_node_directory_in_path(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli.shutil, "which", lambda cmd: "/home/test/.nvm/versions/node/v24.14.0/bin/node" if cmd == "node" else None)
|
||||
|
|
@ -493,6 +501,14 @@ class TestGeneratedSystemdUnits:
|
|||
# (tool subprocess kill, adapter disconnect) runs — issue #8202.
|
||||
assert self._expected_timeout_stop_sec() in unit
|
||||
assert "WantedBy=multi-user.target" in unit
|
||||
# ExecStopPost reaps any process the gateway didn't clean up itself,
|
||||
# so long-lived helpers (e.g. adb) can't be left orphaned in the
|
||||
# cgroup and block Restart=always — issue #37454.
|
||||
assert "ExecStopPost=" in unit
|
||||
assert "-m gateway.cgroup_cleanup" in unit
|
||||
# KillMode=mixed is preserved so the gateway still reaps its own
|
||||
# tool-call children before systemd SIGKILLs the cgroup — #8202.
|
||||
assert "KillMode=mixed" in unit
|
||||
|
||||
|
||||
class TestGatewayStopCleanup:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue