fix(gateway): kill tool subprocesses before adapter disconnect on drain timeout (#14728)

Closes #8202. Root cause: stop() reclaimed tool-call bash/sleep children only at the very end of the shutdown sequence — after a 60s drain, 5s interrupt grace, and per-adapter disconnect. Under systemd (TimeoutStopSec bounded by drain_timeout), that meant the cgroup SIGKILL escalation fired first, and systemd reaped the bash/sleep children instead of us. Fix: - Extract tool-subprocess cleanup into a local helper _kill_tool_subprocesses() in _stop_impl(). - Invoke it eagerly right after _interrupt_running_agents() on the drain-timeout path, before adapter disconnect. - Keep the existing catch-all call at the end for the graceful path and defense in depth against mid-teardown respawns. - Bump generated systemd unit TimeoutStopSec to drain_timeout + 30s so cleanup + disconnect + DB close has headroom above the drain budget, matching the 'subprocess timeout > TimeoutStopSec + margin' rule from the skill. Tests: - New: test_gateway_stop_kills_tool_subprocesses_before_adapter_disconnect_on_timeout asserts kill_all() runs before disconnect() when drain times out. - New: test_gateway_stop_kills_tool_subprocesses_on_graceful_path guards that the final catch-all still fires when drain succeeds (regression guard against accidental removal during refactor). - Updated: existing systemd unit generator tests expect TimeoutStopSec=90 (= 60s drain + 30s headroom) with explanatory comment.
2026-07-25 17:18:11 +00:00 · 2026-04-23 13:59:29 -07:00 · 2026-04-23 13:59:29 -07:00 · 327b57da91
commit 327b57da91
parent 64e6165686
4 changed files with 150 additions and 19 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -2560,6 +2560,40 @@ class GatewayRunner:
            return

        async def _stop_impl() -> None:
+            def _kill_tool_subprocesses(phase: str) -> None:
+                """Kill tool subprocesses + tear down terminal envs + browsers.
+
+                Called twice in the shutdown path: once eagerly after a
+                drain timeout forces agent interrupt (so we reclaim bash/
+                sleep children before systemd TimeoutStopSec escalates to
+                SIGKILL on the cgroup — #8202), and once as a final
+                catch-all at the end of _stop_impl() for the graceful
+                path or anything respawned mid-teardown.
+
+                All steps are best-effort; exceptions are swallowed so
+                one subsystem's failure doesn't block the rest.
+                """
+                try:
+                    from tools.process_registry import process_registry
+                    _killed = process_registry.kill_all()
+                    if _killed:
+                        logger.info(
+                            "Shutdown (%s): killed %d tool subprocess(es)",
+                            phase, _killed,
+                        )
+                except Exception as _e:
+                    logger.debug("process_registry.kill_all (%s) error: %s", phase, _e)
+                try:
+                    from tools.terminal_tool import cleanup_all_environments
+                    cleanup_all_environments()
+                except Exception as _e:
+                    logger.debug("cleanup_all_environments (%s) error: %s", phase, _e)
+                try:
+                    from tools.browser_tool import cleanup_all_browsers
+                    cleanup_all_browsers()
+                except Exception as _e:
+                    logger.debug("cleanup_all_browsers (%s) error: %s", phase, _e)
+
            logger.info(
                "Stopping gateway%s...",
                " for restart" if self._restart_requested else "",
@ -2621,6 +2655,16 @@ class GatewayRunner:
                    self._update_runtime_status("draining")
                    await asyncio.sleep(0.1)

+                # Kill lingering tool subprocesses NOW, before we spend more
+                # budget on adapter disconnect / session DB close.  Under
+                # systemd (TimeoutStopSec bounded by drain_timeout+headroom),
+                # deferring this to the end of stop() risks systemd escalating
+                # to SIGKILL on the cgroup first — at which point bash/sleep
+                # children left behind by an interrupted terminal tool get
+                # killed by systemd instead of us (issue #8202).  The final
+                # catch-all cleanup below still runs for the graceful path.
+                _kill_tool_subprocesses("post-interrupt")
+
            if self._restart_requested and self._restart_detached:
                try:
                    await self._launch_detached_restart_command()
@ -2656,22 +2700,13 @@ class GatewayRunner:
            self._shutdown_event.set()

            # Global cleanup: kill any remaining tool subprocesses not tied
-            # to a specific agent (catch-all for zombie prevention).
-            try:
-                from tools.process_registry import process_registry
-                process_registry.kill_all()
-            except Exception:
-                pass
-            try:
-                from tools.terminal_tool import cleanup_all_environments
-                cleanup_all_environments()
-            except Exception:
-                pass
-            try:
-                from tools.browser_tool import cleanup_all_browsers
-                cleanup_all_browsers()
-            except Exception:
-                pass
+            # to a specific agent (catch-all for zombie prevention). On the
+            # drain-timeout path we already did this earlier after agent
+            # interrupt — this second call catches (a) the graceful path
+            # where drain succeeded without interrupt, and (b) anything
+            # that got respawned between the earlier call and adapter
+            # disconnect (defense in depth; safe to call repeatedly).
+            _kill_tool_subprocesses("final-cleanup")

            # Close SQLite session DBs so the WAL write lock is released.
            # Without this, --replace and similar restart flows leave the
--- a/hermes_cli/gateway.py
+++ b/hermes_cli/gateway.py
@ -1469,7 +1469,14 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
            path_entries.append(resolved_node_dir)

    common_bin_paths = ["/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"]
-    restart_timeout = max(60, int(_get_restart_drain_timeout() or 0))
+    # systemd's TimeoutStopSec must exceed the gateway's drain_timeout so
+    # there's budget left for post-interrupt cleanup (tool subprocess kill,
+    # adapter disconnect, session DB close) before systemd escalates to
+    # SIGKILL on the cgroup — otherwise bash/sleep tool-call children left
+    # by a force-interrupted agent get reaped by systemd instead of us
+    # (#8202). 30s of headroom covers the worst case we've observed.
+    _drain_timeout = int(_get_restart_drain_timeout() or 0)
+    restart_timeout = max(60, _drain_timeout) + 30

    if system:
        username, group_name, home_dir = _system_service_identity(run_as_user)
--- a/tests/gateway/test_gateway_shutdown.py
+++ b/tests/gateway/test_gateway_shutdown.py
@ -145,3 +145,86 @@ async def test_drain_active_agents_throttles_status_updates():
    # Start, one count-change update, and final update. Allow one extra update
    # if the loop observes the zero-agent state before exiting.
    assert 3 <= runner._update_runtime_status.call_count <= 4
+
+
+@pytest.mark.asyncio
+async def test_gateway_stop_kills_tool_subprocesses_before_adapter_disconnect_on_timeout(monkeypatch):
+    """On drain timeout, tool subprocesses must be killed BEFORE adapter
+    disconnect so systemd's TimeoutStopSec doesn't SIGKILL the cgroup with
+    bash/sleep children still attached (#8202)."""
+    runner, adapter = make_restart_runner()
+    runner._restart_drain_timeout = 0.01  # force timeout path
+
+    call_order: list[str] = []
+
+    def _fake_kill_all(task_id=None):
+        call_order.append("kill_all")
+        return 2
+
+    def _fake_cleanup_envs():
+        call_order.append("cleanup_environments")
+
+    def _fake_cleanup_browsers():
+        call_order.append("cleanup_browsers")
+
+    async def _disconnect():
+        call_order.append("disconnect")
+
+    # Patch the module-level names the stop() helper imports lazily.
+    import tools.process_registry as _pr
+    import tools.terminal_tool as _tt
+    import tools.browser_tool as _bt
+    monkeypatch.setattr(_pr.process_registry, "kill_all", _fake_kill_all)
+    monkeypatch.setattr(_tt, "cleanup_all_environments", _fake_cleanup_envs)
+    monkeypatch.setattr(_bt, "cleanup_all_browsers", _fake_cleanup_browsers)
+
+    adapter.disconnect = _disconnect
+
+    runner._running_agents = {"session": MagicMock()}
+
+    with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"):
+        await runner.stop()
+
+    # First kill_all must precede the first disconnect.  (Both the eager
+    # post-interrupt cleanup and the final catch-all call _kill_tool_
+    # subprocesses, so we expect kill_all to appear twice total.)
+    assert "kill_all" in call_order
+    assert "disconnect" in call_order
+    first_kill = call_order.index("kill_all")
+    first_disconnect = call_order.index("disconnect")
+    assert first_kill < first_disconnect, (
+        f"Tool subprocesses must be killed before adapter disconnect on "
+        f"drain timeout, got order: {call_order}"
+    )
+    # Defense-in-depth final cleanup still runs.
+    assert call_order.count("kill_all") >= 2
+
+
+@pytest.mark.asyncio
+async def test_gateway_stop_kills_tool_subprocesses_on_graceful_path(monkeypatch):
+    """Graceful shutdown (no drain timeout) must still kill tool subprocesses
+    exactly once via the final catch-all — regression guard against
+    accidentally removing that call when refactoring."""
+    runner, adapter = make_restart_runner()
+    adapter.disconnect = AsyncMock()
+
+    kill_count = 0
+
+    def _fake_kill_all(task_id=None):
+        nonlocal kill_count
+        kill_count += 1
+        return 0
+
+    import tools.process_registry as _pr
+    import tools.terminal_tool as _tt
+    import tools.browser_tool as _bt
+    monkeypatch.setattr(_pr.process_registry, "kill_all", _fake_kill_all)
+    monkeypatch.setattr(_tt, "cleanup_all_environments", lambda: None)
+    monkeypatch.setattr(_bt, "cleanup_all_browsers", lambda: None)
+
+    # No running agents → drain returns immediately, no timeout, no eager cleanup.
+    with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"):
+        await runner.stop()
+
+    # Only the final catch-all fires on the graceful path.
+    assert kill_count == 1
--- a/tests/hermes_cli/test_gateway_service.py
+++ b/tests/hermes_cli/test_gateway_service.py
@ -95,7 +95,10 @@ class TestGeneratedSystemdUnits:
        assert "ExecStop=" not in unit
        assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
        assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
-        assert "TimeoutStopSec=60" in unit
+        # TimeoutStopSec must exceed the default drain_timeout (60s) so
+        # systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
+        # (tool subprocess kill, adapter disconnect) runs — issue #8202.
+        assert "TimeoutStopSec=90" in unit

    def test_user_unit_includes_resolved_node_directory_in_path(self, monkeypatch):
        monkeypatch.setattr(gateway_cli.shutil, "which", lambda cmd: "/home/test/.nvm/versions/node/v24.14.0/bin/node" if cmd == "node" else None)
@ -111,7 +114,10 @@ class TestGeneratedSystemdUnits:
        assert "ExecStop=" not in unit
        assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
        assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
-        assert "TimeoutStopSec=60" in unit
+        # TimeoutStopSec must exceed the default drain_timeout (60s) so
+        # systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
+        # (tool subprocess kill, adapter disconnect) runs — issue #8202.
+        assert "TimeoutStopSec=90" in unit
        assert "WantedBy=multi-user.target" in unit