fix(gateway): kill tool subprocesses before adapter disconnect on drain timeout (#14728)

Closes #8202.

Root cause: stop() reclaimed tool-call bash/sleep children only at the
very end of the shutdown sequence — after a 60s drain, 5s interrupt
grace, and per-adapter disconnect. Under systemd (TimeoutStopSec bounded
by drain_timeout), that meant the cgroup SIGKILL escalation fired first,
and systemd reaped the bash/sleep children instead of us.

Fix:
- Extract tool-subprocess cleanup into a local helper
  _kill_tool_subprocesses() in _stop_impl().
- Invoke it eagerly right after _interrupt_running_agents() on the
  drain-timeout path, before adapter disconnect.
- Keep the existing catch-all call at the end for the graceful path
  and defense in depth against mid-teardown respawns.
- Bump generated systemd unit TimeoutStopSec to drain_timeout + 30s
  so cleanup + disconnect + DB close has headroom above the drain
  budget, matching the 'subprocess timeout > TimeoutStopSec + margin'
  rule from the skill.

Tests:
- New: test_gateway_stop_kills_tool_subprocesses_before_adapter_disconnect_on_timeout
  asserts kill_all() runs before disconnect() when drain times out.
- New: test_gateway_stop_kills_tool_subprocesses_on_graceful_path
  guards that the final catch-all still fires when drain succeeds
  (regression guard against accidental removal during refactor).
- Updated: existing systemd unit generator tests expect TimeoutStopSec=90
  (= 60s drain + 30s headroom) with explanatory comment.
This commit is contained in:
Teknium 2026-04-23 13:59:29 -07:00 committed by GitHub
parent 64e6165686
commit 327b57da91
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 150 additions and 19 deletions

View file

@ -2560,6 +2560,40 @@ class GatewayRunner:
return
async def _stop_impl() -> None:
def _kill_tool_subprocesses(phase: str) -> None:
"""Kill tool subprocesses + tear down terminal envs + browsers.
Called twice in the shutdown path: once eagerly after a
drain timeout forces agent interrupt (so we reclaim bash/
sleep children before systemd TimeoutStopSec escalates to
SIGKILL on the cgroup #8202), and once as a final
catch-all at the end of _stop_impl() for the graceful
path or anything respawned mid-teardown.
All steps are best-effort; exceptions are swallowed so
one subsystem's failure doesn't block the rest.
"""
try:
from tools.process_registry import process_registry
_killed = process_registry.kill_all()
if _killed:
logger.info(
"Shutdown (%s): killed %d tool subprocess(es)",
phase, _killed,
)
except Exception as _e:
logger.debug("process_registry.kill_all (%s) error: %s", phase, _e)
try:
from tools.terminal_tool import cleanup_all_environments
cleanup_all_environments()
except Exception as _e:
logger.debug("cleanup_all_environments (%s) error: %s", phase, _e)
try:
from tools.browser_tool import cleanup_all_browsers
cleanup_all_browsers()
except Exception as _e:
logger.debug("cleanup_all_browsers (%s) error: %s", phase, _e)
logger.info(
"Stopping gateway%s...",
" for restart" if self._restart_requested else "",
@ -2621,6 +2655,16 @@ class GatewayRunner:
self._update_runtime_status("draining")
await asyncio.sleep(0.1)
# Kill lingering tool subprocesses NOW, before we spend more
# budget on adapter disconnect / session DB close. Under
# systemd (TimeoutStopSec bounded by drain_timeout+headroom),
# deferring this to the end of stop() risks systemd escalating
# to SIGKILL on the cgroup first — at which point bash/sleep
# children left behind by an interrupted terminal tool get
# killed by systemd instead of us (issue #8202). The final
# catch-all cleanup below still runs for the graceful path.
_kill_tool_subprocesses("post-interrupt")
if self._restart_requested and self._restart_detached:
try:
await self._launch_detached_restart_command()
@ -2656,22 +2700,13 @@ class GatewayRunner:
self._shutdown_event.set()
# Global cleanup: kill any remaining tool subprocesses not tied
# to a specific agent (catch-all for zombie prevention).
try:
from tools.process_registry import process_registry
process_registry.kill_all()
except Exception:
pass
try:
from tools.terminal_tool import cleanup_all_environments
cleanup_all_environments()
except Exception:
pass
try:
from tools.browser_tool import cleanup_all_browsers
cleanup_all_browsers()
except Exception:
pass
# to a specific agent (catch-all for zombie prevention). On the
# drain-timeout path we already did this earlier after agent
# interrupt — this second call catches (a) the graceful path
# where drain succeeded without interrupt, and (b) anything
# that got respawned between the earlier call and adapter
# disconnect (defense in depth; safe to call repeatedly).
_kill_tool_subprocesses("final-cleanup")
# Close SQLite session DBs so the WAL write lock is released.
# Without this, --replace and similar restart flows leave the

View file

@ -1469,7 +1469,14 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
path_entries.append(resolved_node_dir)
common_bin_paths = ["/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"]
restart_timeout = max(60, int(_get_restart_drain_timeout() or 0))
# systemd's TimeoutStopSec must exceed the gateway's drain_timeout so
# there's budget left for post-interrupt cleanup (tool subprocess kill,
# adapter disconnect, session DB close) before systemd escalates to
# SIGKILL on the cgroup — otherwise bash/sleep tool-call children left
# by a force-interrupted agent get reaped by systemd instead of us
# (#8202). 30s of headroom covers the worst case we've observed.
_drain_timeout = int(_get_restart_drain_timeout() or 0)
restart_timeout = max(60, _drain_timeout) + 30
if system:
username, group_name, home_dir = _system_service_identity(run_as_user)

View file

@ -145,3 +145,86 @@ async def test_drain_active_agents_throttles_status_updates():
# Start, one count-change update, and final update. Allow one extra update
# if the loop observes the zero-agent state before exiting.
assert 3 <= runner._update_runtime_status.call_count <= 4
@pytest.mark.asyncio
async def test_gateway_stop_kills_tool_subprocesses_before_adapter_disconnect_on_timeout(monkeypatch):
"""On drain timeout, tool subprocesses must be killed BEFORE adapter
disconnect so systemd's TimeoutStopSec doesn't SIGKILL the cgroup with
bash/sleep children still attached (#8202)."""
runner, adapter = make_restart_runner()
runner._restart_drain_timeout = 0.01 # force timeout path
call_order: list[str] = []
def _fake_kill_all(task_id=None):
call_order.append("kill_all")
return 2
def _fake_cleanup_envs():
call_order.append("cleanup_environments")
def _fake_cleanup_browsers():
call_order.append("cleanup_browsers")
async def _disconnect():
call_order.append("disconnect")
# Patch the module-level names the stop() helper imports lazily.
import tools.process_registry as _pr
import tools.terminal_tool as _tt
import tools.browser_tool as _bt
monkeypatch.setattr(_pr.process_registry, "kill_all", _fake_kill_all)
monkeypatch.setattr(_tt, "cleanup_all_environments", _fake_cleanup_envs)
monkeypatch.setattr(_bt, "cleanup_all_browsers", _fake_cleanup_browsers)
adapter.disconnect = _disconnect
runner._running_agents = {"session": MagicMock()}
with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"):
await runner.stop()
# First kill_all must precede the first disconnect. (Both the eager
# post-interrupt cleanup and the final catch-all call _kill_tool_
# subprocesses, so we expect kill_all to appear twice total.)
assert "kill_all" in call_order
assert "disconnect" in call_order
first_kill = call_order.index("kill_all")
first_disconnect = call_order.index("disconnect")
assert first_kill < first_disconnect, (
f"Tool subprocesses must be killed before adapter disconnect on "
f"drain timeout, got order: {call_order}"
)
# Defense-in-depth final cleanup still runs.
assert call_order.count("kill_all") >= 2
@pytest.mark.asyncio
async def test_gateway_stop_kills_tool_subprocesses_on_graceful_path(monkeypatch):
"""Graceful shutdown (no drain timeout) must still kill tool subprocesses
exactly once via the final catch-all regression guard against
accidentally removing that call when refactoring."""
runner, adapter = make_restart_runner()
adapter.disconnect = AsyncMock()
kill_count = 0
def _fake_kill_all(task_id=None):
nonlocal kill_count
kill_count += 1
return 0
import tools.process_registry as _pr
import tools.terminal_tool as _tt
import tools.browser_tool as _bt
monkeypatch.setattr(_pr.process_registry, "kill_all", _fake_kill_all)
monkeypatch.setattr(_tt, "cleanup_all_environments", lambda: None)
monkeypatch.setattr(_bt, "cleanup_all_browsers", lambda: None)
# No running agents → drain returns immediately, no timeout, no eager cleanup.
with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"):
await runner.stop()
# Only the final catch-all fires on the graceful path.
assert kill_count == 1

View file

@ -95,7 +95,10 @@ class TestGeneratedSystemdUnits:
assert "ExecStop=" not in unit
assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
assert "TimeoutStopSec=60" in unit
# TimeoutStopSec must exceed the default drain_timeout (60s) so
# systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
# (tool subprocess kill, adapter disconnect) runs — issue #8202.
assert "TimeoutStopSec=90" in unit
def test_user_unit_includes_resolved_node_directory_in_path(self, monkeypatch):
monkeypatch.setattr(gateway_cli.shutil, "which", lambda cmd: "/home/test/.nvm/versions/node/v24.14.0/bin/node" if cmd == "node" else None)
@ -111,7 +114,10 @@ class TestGeneratedSystemdUnits:
assert "ExecStop=" not in unit
assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
assert "TimeoutStopSec=60" in unit
# TimeoutStopSec must exceed the default drain_timeout (60s) so
# systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
# (tool subprocess kill, adapter disconnect) runs — issue #8202.
assert "TimeoutStopSec=90" in unit
assert "WantedBy=multi-user.target" in unit