From 327b57da91e5699564dd423bbb112ea95671b6e5 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 23 Apr 2026 13:59:29 -0700 Subject: [PATCH] fix(gateway): kill tool subprocesses before adapter disconnect on drain timeout (#14728) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #8202. Root cause: stop() reclaimed tool-call bash/sleep children only at the very end of the shutdown sequence — after a 60s drain, 5s interrupt grace, and per-adapter disconnect. Under systemd (TimeoutStopSec bounded by drain_timeout), that meant the cgroup SIGKILL escalation fired first, and systemd reaped the bash/sleep children instead of us. Fix: - Extract tool-subprocess cleanup into a local helper _kill_tool_subprocesses() in _stop_impl(). - Invoke it eagerly right after _interrupt_running_agents() on the drain-timeout path, before adapter disconnect. - Keep the existing catch-all call at the end for the graceful path and defense in depth against mid-teardown respawns. - Bump generated systemd unit TimeoutStopSec to drain_timeout + 30s so cleanup + disconnect + DB close has headroom above the drain budget, matching the 'subprocess timeout > TimeoutStopSec + margin' rule from the skill. Tests: - New: test_gateway_stop_kills_tool_subprocesses_before_adapter_disconnect_on_timeout asserts kill_all() runs before disconnect() when drain times out. - New: test_gateway_stop_kills_tool_subprocesses_on_graceful_path guards that the final catch-all still fires when drain succeeds (regression guard against accidental removal during refactor). - Updated: existing systemd unit generator tests expect TimeoutStopSec=90 (= 60s drain + 30s headroom) with explanatory comment. --- gateway/run.py | 67 ++++++++++++++----- hermes_cli/gateway.py | 9 ++- tests/gateway/test_gateway_shutdown.py | 83 ++++++++++++++++++++++++ tests/hermes_cli/test_gateway_service.py | 10 ++- 4 files changed, 150 insertions(+), 19 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index dcee18e51..2c377980c 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2560,6 +2560,40 @@ class GatewayRunner: return async def _stop_impl() -> None: + def _kill_tool_subprocesses(phase: str) -> None: + """Kill tool subprocesses + tear down terminal envs + browsers. + + Called twice in the shutdown path: once eagerly after a + drain timeout forces agent interrupt (so we reclaim bash/ + sleep children before systemd TimeoutStopSec escalates to + SIGKILL on the cgroup — #8202), and once as a final + catch-all at the end of _stop_impl() for the graceful + path or anything respawned mid-teardown. + + All steps are best-effort; exceptions are swallowed so + one subsystem's failure doesn't block the rest. + """ + try: + from tools.process_registry import process_registry + _killed = process_registry.kill_all() + if _killed: + logger.info( + "Shutdown (%s): killed %d tool subprocess(es)", + phase, _killed, + ) + except Exception as _e: + logger.debug("process_registry.kill_all (%s) error: %s", phase, _e) + try: + from tools.terminal_tool import cleanup_all_environments + cleanup_all_environments() + except Exception as _e: + logger.debug("cleanup_all_environments (%s) error: %s", phase, _e) + try: + from tools.browser_tool import cleanup_all_browsers + cleanup_all_browsers() + except Exception as _e: + logger.debug("cleanup_all_browsers (%s) error: %s", phase, _e) + logger.info( "Stopping gateway%s...", " for restart" if self._restart_requested else "", @@ -2621,6 +2655,16 @@ class GatewayRunner: self._update_runtime_status("draining") await asyncio.sleep(0.1) + # Kill lingering tool subprocesses NOW, before we spend more + # budget on adapter disconnect / session DB close. Under + # systemd (TimeoutStopSec bounded by drain_timeout+headroom), + # deferring this to the end of stop() risks systemd escalating + # to SIGKILL on the cgroup first — at which point bash/sleep + # children left behind by an interrupted terminal tool get + # killed by systemd instead of us (issue #8202). The final + # catch-all cleanup below still runs for the graceful path. + _kill_tool_subprocesses("post-interrupt") + if self._restart_requested and self._restart_detached: try: await self._launch_detached_restart_command() @@ -2656,22 +2700,13 @@ class GatewayRunner: self._shutdown_event.set() # Global cleanup: kill any remaining tool subprocesses not tied - # to a specific agent (catch-all for zombie prevention). - try: - from tools.process_registry import process_registry - process_registry.kill_all() - except Exception: - pass - try: - from tools.terminal_tool import cleanup_all_environments - cleanup_all_environments() - except Exception: - pass - try: - from tools.browser_tool import cleanup_all_browsers - cleanup_all_browsers() - except Exception: - pass + # to a specific agent (catch-all for zombie prevention). On the + # drain-timeout path we already did this earlier after agent + # interrupt — this second call catches (a) the graceful path + # where drain succeeded without interrupt, and (b) anything + # that got respawned between the earlier call and adapter + # disconnect (defense in depth; safe to call repeatedly). + _kill_tool_subprocesses("final-cleanup") # Close SQLite session DBs so the WAL write lock is released. # Without this, --replace and similar restart flows leave the diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 7796cc575..9773299d5 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -1469,7 +1469,14 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None) path_entries.append(resolved_node_dir) common_bin_paths = ["/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"] - restart_timeout = max(60, int(_get_restart_drain_timeout() or 0)) + # systemd's TimeoutStopSec must exceed the gateway's drain_timeout so + # there's budget left for post-interrupt cleanup (tool subprocess kill, + # adapter disconnect, session DB close) before systemd escalates to + # SIGKILL on the cgroup — otherwise bash/sleep tool-call children left + # by a force-interrupted agent get reaped by systemd instead of us + # (#8202). 30s of headroom covers the worst case we've observed. + _drain_timeout = int(_get_restart_drain_timeout() or 0) + restart_timeout = max(60, _drain_timeout) + 30 if system: username, group_name, home_dir = _system_service_identity(run_as_user) diff --git a/tests/gateway/test_gateway_shutdown.py b/tests/gateway/test_gateway_shutdown.py index 4dc9919bc..137ddfd03 100644 --- a/tests/gateway/test_gateway_shutdown.py +++ b/tests/gateway/test_gateway_shutdown.py @@ -145,3 +145,86 @@ async def test_drain_active_agents_throttles_status_updates(): # Start, one count-change update, and final update. Allow one extra update # if the loop observes the zero-agent state before exiting. assert 3 <= runner._update_runtime_status.call_count <= 4 + + +@pytest.mark.asyncio +async def test_gateway_stop_kills_tool_subprocesses_before_adapter_disconnect_on_timeout(monkeypatch): + """On drain timeout, tool subprocesses must be killed BEFORE adapter + disconnect so systemd's TimeoutStopSec doesn't SIGKILL the cgroup with + bash/sleep children still attached (#8202).""" + runner, adapter = make_restart_runner() + runner._restart_drain_timeout = 0.01 # force timeout path + + call_order: list[str] = [] + + def _fake_kill_all(task_id=None): + call_order.append("kill_all") + return 2 + + def _fake_cleanup_envs(): + call_order.append("cleanup_environments") + + def _fake_cleanup_browsers(): + call_order.append("cleanup_browsers") + + async def _disconnect(): + call_order.append("disconnect") + + # Patch the module-level names the stop() helper imports lazily. + import tools.process_registry as _pr + import tools.terminal_tool as _tt + import tools.browser_tool as _bt + monkeypatch.setattr(_pr.process_registry, "kill_all", _fake_kill_all) + monkeypatch.setattr(_tt, "cleanup_all_environments", _fake_cleanup_envs) + monkeypatch.setattr(_bt, "cleanup_all_browsers", _fake_cleanup_browsers) + + adapter.disconnect = _disconnect + + runner._running_agents = {"session": MagicMock()} + + with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"): + await runner.stop() + + # First kill_all must precede the first disconnect. (Both the eager + # post-interrupt cleanup and the final catch-all call _kill_tool_ + # subprocesses, so we expect kill_all to appear twice total.) + assert "kill_all" in call_order + assert "disconnect" in call_order + first_kill = call_order.index("kill_all") + first_disconnect = call_order.index("disconnect") + assert first_kill < first_disconnect, ( + f"Tool subprocesses must be killed before adapter disconnect on " + f"drain timeout, got order: {call_order}" + ) + # Defense-in-depth final cleanup still runs. + assert call_order.count("kill_all") >= 2 + + +@pytest.mark.asyncio +async def test_gateway_stop_kills_tool_subprocesses_on_graceful_path(monkeypatch): + """Graceful shutdown (no drain timeout) must still kill tool subprocesses + exactly once via the final catch-all — regression guard against + accidentally removing that call when refactoring.""" + runner, adapter = make_restart_runner() + adapter.disconnect = AsyncMock() + + kill_count = 0 + + def _fake_kill_all(task_id=None): + nonlocal kill_count + kill_count += 1 + return 0 + + import tools.process_registry as _pr + import tools.terminal_tool as _tt + import tools.browser_tool as _bt + monkeypatch.setattr(_pr.process_registry, "kill_all", _fake_kill_all) + monkeypatch.setattr(_tt, "cleanup_all_environments", lambda: None) + monkeypatch.setattr(_bt, "cleanup_all_browsers", lambda: None) + + # No running agents → drain returns immediately, no timeout, no eager cleanup. + with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"): + await runner.stop() + + # Only the final catch-all fires on the graceful path. + assert kill_count == 1 diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index 68554a496..bd429bff2 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -95,7 +95,10 @@ class TestGeneratedSystemdUnits: assert "ExecStop=" not in unit assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit - assert "TimeoutStopSec=60" in unit + # TimeoutStopSec must exceed the default drain_timeout (60s) so + # systemd doesn't SIGKILL the cgroup before post-interrupt cleanup + # (tool subprocess kill, adapter disconnect) runs — issue #8202. + assert "TimeoutStopSec=90" in unit def test_user_unit_includes_resolved_node_directory_in_path(self, monkeypatch): monkeypatch.setattr(gateway_cli.shutil, "which", lambda cmd: "/home/test/.nvm/versions/node/v24.14.0/bin/node" if cmd == "node" else None) @@ -111,7 +114,10 @@ class TestGeneratedSystemdUnits: assert "ExecStop=" not in unit assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit - assert "TimeoutStopSec=60" in unit + # TimeoutStopSec must exceed the default drain_timeout (60s) so + # systemd doesn't SIGKILL the cgroup before post-interrupt cleanup + # (tool subprocess kill, adapter disconnect) runs — issue #8202. + assert "TimeoutStopSec=90" in unit assert "WantedBy=multi-user.target" in unit