From 327b57da91e5699564dd423bbb112ea95671b6e5 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Thu, 23 Apr 2026 13:59:29 -0700
Subject: [PATCH] fix(gateway): kill tool subprocesses before adapter
 disconnect on drain timeout (#14728)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #8202.

Root cause: stop() reclaimed tool-call bash/sleep children only at the
very end of the shutdown sequence — after a 60s drain, 5s interrupt
grace, and per-adapter disconnect. Under systemd (TimeoutStopSec bounded
by drain_timeout), that meant the cgroup SIGKILL escalation fired first,
and systemd reaped the bash/sleep children instead of us.

Fix:
- Extract tool-subprocess cleanup into a local helper
  _kill_tool_subprocesses() in _stop_impl().
- Invoke it eagerly right after _interrupt_running_agents() on the
  drain-timeout path, before adapter disconnect.
- Keep the existing catch-all call at the end for the graceful path
  and defense in depth against mid-teardown respawns.
- Bump generated systemd unit TimeoutStopSec to drain_timeout + 30s
  so cleanup + disconnect + DB close has headroom above the drain
  budget, matching the 'subprocess timeout > TimeoutStopSec + margin'
  rule from the skill.

Tests:
- New: test_gateway_stop_kills_tool_subprocesses_before_adapter_disconnect_on_timeout
  asserts kill_all() runs before disconnect() when drain times out.
- New: test_gateway_stop_kills_tool_subprocesses_on_graceful_path
  guards that the final catch-all still fires when drain succeeds
  (regression guard against accidental removal during refactor).
- Updated: existing systemd unit generator tests expect TimeoutStopSec=90
  (= 60s drain + 30s headroom) with explanatory comment.
---
 gateway/run.py                           | 67 ++++++++++++++-----
 hermes_cli/gateway.py                    |  9 ++-
 tests/gateway/test_gateway_shutdown.py   | 83 ++++++++++++++++++++++++
 tests/hermes_cli/test_gateway_service.py | 10 ++-
 4 files changed, 150 insertions(+), 19 deletions(-)

diff --git a/gateway/run.py b/gateway/run.py
index dcee18e51..2c377980c 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -2560,6 +2560,40 @@ class GatewayRunner:
             return
 
         async def _stop_impl() -> None:
+            def _kill_tool_subprocesses(phase: str) -> None:
+                """Kill tool subprocesses + tear down terminal envs + browsers.
+
+                Called twice in the shutdown path: once eagerly after a
+                drain timeout forces agent interrupt (so we reclaim bash/
+                sleep children before systemd TimeoutStopSec escalates to
+                SIGKILL on the cgroup — #8202), and once as a final
+                catch-all at the end of _stop_impl() for the graceful
+                path or anything respawned mid-teardown.
+
+                All steps are best-effort; exceptions are swallowed so
+                one subsystem's failure doesn't block the rest.
+                """
+                try:
+                    from tools.process_registry import process_registry
+                    _killed = process_registry.kill_all()
+                    if _killed:
+                        logger.info(
+                            "Shutdown (%s): killed %d tool subprocess(es)",
+                            phase, _killed,
+                        )
+                except Exception as _e:
+                    logger.debug("process_registry.kill_all (%s) error: %s", phase, _e)
+                try:
+                    from tools.terminal_tool import cleanup_all_environments
+                    cleanup_all_environments()
+                except Exception as _e:
+                    logger.debug("cleanup_all_environments (%s) error: %s", phase, _e)
+                try:
+                    from tools.browser_tool import cleanup_all_browsers
+                    cleanup_all_browsers()
+                except Exception as _e:
+                    logger.debug("cleanup_all_browsers (%s) error: %s", phase, _e)
+
             logger.info(
                 "Stopping gateway%s...",
                 " for restart" if self._restart_requested else "",
@@ -2621,6 +2655,16 @@ class GatewayRunner:
                     self._update_runtime_status("draining")
                     await asyncio.sleep(0.1)
 
+                # Kill lingering tool subprocesses NOW, before we spend more
+                # budget on adapter disconnect / session DB close.  Under
+                # systemd (TimeoutStopSec bounded by drain_timeout+headroom),
+                # deferring this to the end of stop() risks systemd escalating
+                # to SIGKILL on the cgroup first — at which point bash/sleep
+                # children left behind by an interrupted terminal tool get
+                # killed by systemd instead of us (issue #8202).  The final
+                # catch-all cleanup below still runs for the graceful path.
+                _kill_tool_subprocesses("post-interrupt")
+
             if self._restart_requested and self._restart_detached:
                 try:
                     await self._launch_detached_restart_command()
@@ -2656,22 +2700,13 @@ class GatewayRunner:
             self._shutdown_event.set()
 
             # Global cleanup: kill any remaining tool subprocesses not tied
-            # to a specific agent (catch-all for zombie prevention).
-            try:
-                from tools.process_registry import process_registry
-                process_registry.kill_all()
-            except Exception:
-                pass
-            try:
-                from tools.terminal_tool import cleanup_all_environments
-                cleanup_all_environments()
-            except Exception:
-                pass
-            try:
-                from tools.browser_tool import cleanup_all_browsers
-                cleanup_all_browsers()
-            except Exception:
-                pass
+            # to a specific agent (catch-all for zombie prevention). On the
+            # drain-timeout path we already did this earlier after agent
+            # interrupt — this second call catches (a) the graceful path
+            # where drain succeeded without interrupt, and (b) anything
+            # that got respawned between the earlier call and adapter
+            # disconnect (defense in depth; safe to call repeatedly).
+            _kill_tool_subprocesses("final-cleanup")
 
             # Close SQLite session DBs so the WAL write lock is released.
             # Without this, --replace and similar restart flows leave the
diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py
index 7796cc575..9773299d5 100644
--- a/hermes_cli/gateway.py
+++ b/hermes_cli/gateway.py
@@ -1469,7 +1469,14 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
             path_entries.append(resolved_node_dir)
 
     common_bin_paths = ["/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"]
-    restart_timeout = max(60, int(_get_restart_drain_timeout() or 0))
+    # systemd's TimeoutStopSec must exceed the gateway's drain_timeout so
+    # there's budget left for post-interrupt cleanup (tool subprocess kill,
+    # adapter disconnect, session DB close) before systemd escalates to
+    # SIGKILL on the cgroup — otherwise bash/sleep tool-call children left
+    # by a force-interrupted agent get reaped by systemd instead of us
+    # (#8202). 30s of headroom covers the worst case we've observed.
+    _drain_timeout = int(_get_restart_drain_timeout() or 0)
+    restart_timeout = max(60, _drain_timeout) + 30
 
     if system:
         username, group_name, home_dir = _system_service_identity(run_as_user)
diff --git a/tests/gateway/test_gateway_shutdown.py b/tests/gateway/test_gateway_shutdown.py
index 4dc9919bc..137ddfd03 100644
--- a/tests/gateway/test_gateway_shutdown.py
+++ b/tests/gateway/test_gateway_shutdown.py
@@ -145,3 +145,86 @@ async def test_drain_active_agents_throttles_status_updates():
     # Start, one count-change update, and final update. Allow one extra update
     # if the loop observes the zero-agent state before exiting.
     assert 3 <= runner._update_runtime_status.call_count <= 4
+
+
+@pytest.mark.asyncio
+async def test_gateway_stop_kills_tool_subprocesses_before_adapter_disconnect_on_timeout(monkeypatch):
+    """On drain timeout, tool subprocesses must be killed BEFORE adapter
+    disconnect so systemd's TimeoutStopSec doesn't SIGKILL the cgroup with
+    bash/sleep children still attached (#8202)."""
+    runner, adapter = make_restart_runner()
+    runner._restart_drain_timeout = 0.01  # force timeout path
+
+    call_order: list[str] = []
+
+    def _fake_kill_all(task_id=None):
+        call_order.append("kill_all")
+        return 2
+
+    def _fake_cleanup_envs():
+        call_order.append("cleanup_environments")
+
+    def _fake_cleanup_browsers():
+        call_order.append("cleanup_browsers")
+
+    async def _disconnect():
+        call_order.append("disconnect")
+
+    # Patch the module-level names the stop() helper imports lazily.
+    import tools.process_registry as _pr
+    import tools.terminal_tool as _tt
+    import tools.browser_tool as _bt
+    monkeypatch.setattr(_pr.process_registry, "kill_all", _fake_kill_all)
+    monkeypatch.setattr(_tt, "cleanup_all_environments", _fake_cleanup_envs)
+    monkeypatch.setattr(_bt, "cleanup_all_browsers", _fake_cleanup_browsers)
+
+    adapter.disconnect = _disconnect
+
+    runner._running_agents = {"session": MagicMock()}
+
+    with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"):
+        await runner.stop()
+
+    # First kill_all must precede the first disconnect.  (Both the eager
+    # post-interrupt cleanup and the final catch-all call _kill_tool_
+    # subprocesses, so we expect kill_all to appear twice total.)
+    assert "kill_all" in call_order
+    assert "disconnect" in call_order
+    first_kill = call_order.index("kill_all")
+    first_disconnect = call_order.index("disconnect")
+    assert first_kill < first_disconnect, (
+        f"Tool subprocesses must be killed before adapter disconnect on "
+        f"drain timeout, got order: {call_order}"
+    )
+    # Defense-in-depth final cleanup still runs.
+    assert call_order.count("kill_all") >= 2
+
+
+@pytest.mark.asyncio
+async def test_gateway_stop_kills_tool_subprocesses_on_graceful_path(monkeypatch):
+    """Graceful shutdown (no drain timeout) must still kill tool subprocesses
+    exactly once via the final catch-all — regression guard against
+    accidentally removing that call when refactoring."""
+    runner, adapter = make_restart_runner()
+    adapter.disconnect = AsyncMock()
+
+    kill_count = 0
+
+    def _fake_kill_all(task_id=None):
+        nonlocal kill_count
+        kill_count += 1
+        return 0
+
+    import tools.process_registry as _pr
+    import tools.terminal_tool as _tt
+    import tools.browser_tool as _bt
+    monkeypatch.setattr(_pr.process_registry, "kill_all", _fake_kill_all)
+    monkeypatch.setattr(_tt, "cleanup_all_environments", lambda: None)
+    monkeypatch.setattr(_bt, "cleanup_all_browsers", lambda: None)
+
+    # No running agents → drain returns immediately, no timeout, no eager cleanup.
+    with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"):
+        await runner.stop()
+
+    # Only the final catch-all fires on the graceful path.
+    assert kill_count == 1
diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py
index 68554a496..bd429bff2 100644
--- a/tests/hermes_cli/test_gateway_service.py
+++ b/tests/hermes_cli/test_gateway_service.py
@@ -95,7 +95,10 @@ class TestGeneratedSystemdUnits:
         assert "ExecStop=" not in unit
         assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
         assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
-        assert "TimeoutStopSec=60" in unit
+        # TimeoutStopSec must exceed the default drain_timeout (60s) so
+        # systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
+        # (tool subprocess kill, adapter disconnect) runs — issue #8202.
+        assert "TimeoutStopSec=90" in unit
 
     def test_user_unit_includes_resolved_node_directory_in_path(self, monkeypatch):
         monkeypatch.setattr(gateway_cli.shutil, "which", lambda cmd: "/home/test/.nvm/versions/node/v24.14.0/bin/node" if cmd == "node" else None)
@@ -111,7 +114,10 @@ class TestGeneratedSystemdUnits:
         assert "ExecStop=" not in unit
         assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
         assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
-        assert "TimeoutStopSec=60" in unit
+        # TimeoutStopSec must exceed the default drain_timeout (60s) so
+        # systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
+        # (tool subprocess kill, adapter disconnect) runs — issue #8202.
+        assert "TimeoutStopSec=90" in unit
         assert "WantedBy=multi-user.target" in unit