mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(gateway): kill tool subprocesses before adapter disconnect on drain timeout (#14728)
Closes #8202. Root cause: stop() reclaimed tool-call bash/sleep children only at the very end of the shutdown sequence — after a 60s drain, 5s interrupt grace, and per-adapter disconnect. Under systemd (TimeoutStopSec bounded by drain_timeout), that meant the cgroup SIGKILL escalation fired first, and systemd reaped the bash/sleep children instead of us. Fix: - Extract tool-subprocess cleanup into a local helper _kill_tool_subprocesses() in _stop_impl(). - Invoke it eagerly right after _interrupt_running_agents() on the drain-timeout path, before adapter disconnect. - Keep the existing catch-all call at the end for the graceful path and defense in depth against mid-teardown respawns. - Bump generated systemd unit TimeoutStopSec to drain_timeout + 30s so cleanup + disconnect + DB close has headroom above the drain budget, matching the 'subprocess timeout > TimeoutStopSec + margin' rule from the skill. Tests: - New: test_gateway_stop_kills_tool_subprocesses_before_adapter_disconnect_on_timeout asserts kill_all() runs before disconnect() when drain times out. - New: test_gateway_stop_kills_tool_subprocesses_on_graceful_path guards that the final catch-all still fires when drain succeeds (regression guard against accidental removal during refactor). - Updated: existing systemd unit generator tests expect TimeoutStopSec=90 (= 60s drain + 30s headroom) with explanatory comment.
This commit is contained in:
parent
64e6165686
commit
327b57da91
4 changed files with 150 additions and 19 deletions
|
|
@ -2560,6 +2560,40 @@ class GatewayRunner:
|
|||
return
|
||||
|
||||
async def _stop_impl() -> None:
|
||||
def _kill_tool_subprocesses(phase: str) -> None:
|
||||
"""Kill tool subprocesses + tear down terminal envs + browsers.
|
||||
|
||||
Called twice in the shutdown path: once eagerly after a
|
||||
drain timeout forces agent interrupt (so we reclaim bash/
|
||||
sleep children before systemd TimeoutStopSec escalates to
|
||||
SIGKILL on the cgroup — #8202), and once as a final
|
||||
catch-all at the end of _stop_impl() for the graceful
|
||||
path or anything respawned mid-teardown.
|
||||
|
||||
All steps are best-effort; exceptions are swallowed so
|
||||
one subsystem's failure doesn't block the rest.
|
||||
"""
|
||||
try:
|
||||
from tools.process_registry import process_registry
|
||||
_killed = process_registry.kill_all()
|
||||
if _killed:
|
||||
logger.info(
|
||||
"Shutdown (%s): killed %d tool subprocess(es)",
|
||||
phase, _killed,
|
||||
)
|
||||
except Exception as _e:
|
||||
logger.debug("process_registry.kill_all (%s) error: %s", phase, _e)
|
||||
try:
|
||||
from tools.terminal_tool import cleanup_all_environments
|
||||
cleanup_all_environments()
|
||||
except Exception as _e:
|
||||
logger.debug("cleanup_all_environments (%s) error: %s", phase, _e)
|
||||
try:
|
||||
from tools.browser_tool import cleanup_all_browsers
|
||||
cleanup_all_browsers()
|
||||
except Exception as _e:
|
||||
logger.debug("cleanup_all_browsers (%s) error: %s", phase, _e)
|
||||
|
||||
logger.info(
|
||||
"Stopping gateway%s...",
|
||||
" for restart" if self._restart_requested else "",
|
||||
|
|
@ -2621,6 +2655,16 @@ class GatewayRunner:
|
|||
self._update_runtime_status("draining")
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
# Kill lingering tool subprocesses NOW, before we spend more
|
||||
# budget on adapter disconnect / session DB close. Under
|
||||
# systemd (TimeoutStopSec bounded by drain_timeout+headroom),
|
||||
# deferring this to the end of stop() risks systemd escalating
|
||||
# to SIGKILL on the cgroup first — at which point bash/sleep
|
||||
# children left behind by an interrupted terminal tool get
|
||||
# killed by systemd instead of us (issue #8202). The final
|
||||
# catch-all cleanup below still runs for the graceful path.
|
||||
_kill_tool_subprocesses("post-interrupt")
|
||||
|
||||
if self._restart_requested and self._restart_detached:
|
||||
try:
|
||||
await self._launch_detached_restart_command()
|
||||
|
|
@ -2656,22 +2700,13 @@ class GatewayRunner:
|
|||
self._shutdown_event.set()
|
||||
|
||||
# Global cleanup: kill any remaining tool subprocesses not tied
|
||||
# to a specific agent (catch-all for zombie prevention).
|
||||
try:
|
||||
from tools.process_registry import process_registry
|
||||
process_registry.kill_all()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
from tools.terminal_tool import cleanup_all_environments
|
||||
cleanup_all_environments()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
from tools.browser_tool import cleanup_all_browsers
|
||||
cleanup_all_browsers()
|
||||
except Exception:
|
||||
pass
|
||||
# to a specific agent (catch-all for zombie prevention). On the
|
||||
# drain-timeout path we already did this earlier after agent
|
||||
# interrupt — this second call catches (a) the graceful path
|
||||
# where drain succeeded without interrupt, and (b) anything
|
||||
# that got respawned between the earlier call and adapter
|
||||
# disconnect (defense in depth; safe to call repeatedly).
|
||||
_kill_tool_subprocesses("final-cleanup")
|
||||
|
||||
# Close SQLite session DBs so the WAL write lock is released.
|
||||
# Without this, --replace and similar restart flows leave the
|
||||
|
|
|
|||
|
|
@ -1469,7 +1469,14 @@ def generate_systemd_unit(system: bool = False, run_as_user: str | None = None)
|
|||
path_entries.append(resolved_node_dir)
|
||||
|
||||
common_bin_paths = ["/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"]
|
||||
restart_timeout = max(60, int(_get_restart_drain_timeout() or 0))
|
||||
# systemd's TimeoutStopSec must exceed the gateway's drain_timeout so
|
||||
# there's budget left for post-interrupt cleanup (tool subprocess kill,
|
||||
# adapter disconnect, session DB close) before systemd escalates to
|
||||
# SIGKILL on the cgroup — otherwise bash/sleep tool-call children left
|
||||
# by a force-interrupted agent get reaped by systemd instead of us
|
||||
# (#8202). 30s of headroom covers the worst case we've observed.
|
||||
_drain_timeout = int(_get_restart_drain_timeout() or 0)
|
||||
restart_timeout = max(60, _drain_timeout) + 30
|
||||
|
||||
if system:
|
||||
username, group_name, home_dir = _system_service_identity(run_as_user)
|
||||
|
|
|
|||
|
|
@ -145,3 +145,86 @@ async def test_drain_active_agents_throttles_status_updates():
|
|||
# Start, one count-change update, and final update. Allow one extra update
|
||||
# if the loop observes the zero-agent state before exiting.
|
||||
assert 3 <= runner._update_runtime_status.call_count <= 4
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_gateway_stop_kills_tool_subprocesses_before_adapter_disconnect_on_timeout(monkeypatch):
|
||||
"""On drain timeout, tool subprocesses must be killed BEFORE adapter
|
||||
disconnect so systemd's TimeoutStopSec doesn't SIGKILL the cgroup with
|
||||
bash/sleep children still attached (#8202)."""
|
||||
runner, adapter = make_restart_runner()
|
||||
runner._restart_drain_timeout = 0.01 # force timeout path
|
||||
|
||||
call_order: list[str] = []
|
||||
|
||||
def _fake_kill_all(task_id=None):
|
||||
call_order.append("kill_all")
|
||||
return 2
|
||||
|
||||
def _fake_cleanup_envs():
|
||||
call_order.append("cleanup_environments")
|
||||
|
||||
def _fake_cleanup_browsers():
|
||||
call_order.append("cleanup_browsers")
|
||||
|
||||
async def _disconnect():
|
||||
call_order.append("disconnect")
|
||||
|
||||
# Patch the module-level names the stop() helper imports lazily.
|
||||
import tools.process_registry as _pr
|
||||
import tools.terminal_tool as _tt
|
||||
import tools.browser_tool as _bt
|
||||
monkeypatch.setattr(_pr.process_registry, "kill_all", _fake_kill_all)
|
||||
monkeypatch.setattr(_tt, "cleanup_all_environments", _fake_cleanup_envs)
|
||||
monkeypatch.setattr(_bt, "cleanup_all_browsers", _fake_cleanup_browsers)
|
||||
|
||||
adapter.disconnect = _disconnect
|
||||
|
||||
runner._running_agents = {"session": MagicMock()}
|
||||
|
||||
with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"):
|
||||
await runner.stop()
|
||||
|
||||
# First kill_all must precede the first disconnect. (Both the eager
|
||||
# post-interrupt cleanup and the final catch-all call _kill_tool_
|
||||
# subprocesses, so we expect kill_all to appear twice total.)
|
||||
assert "kill_all" in call_order
|
||||
assert "disconnect" in call_order
|
||||
first_kill = call_order.index("kill_all")
|
||||
first_disconnect = call_order.index("disconnect")
|
||||
assert first_kill < first_disconnect, (
|
||||
f"Tool subprocesses must be killed before adapter disconnect on "
|
||||
f"drain timeout, got order: {call_order}"
|
||||
)
|
||||
# Defense-in-depth final cleanup still runs.
|
||||
assert call_order.count("kill_all") >= 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_gateway_stop_kills_tool_subprocesses_on_graceful_path(monkeypatch):
|
||||
"""Graceful shutdown (no drain timeout) must still kill tool subprocesses
|
||||
exactly once via the final catch-all — regression guard against
|
||||
accidentally removing that call when refactoring."""
|
||||
runner, adapter = make_restart_runner()
|
||||
adapter.disconnect = AsyncMock()
|
||||
|
||||
kill_count = 0
|
||||
|
||||
def _fake_kill_all(task_id=None):
|
||||
nonlocal kill_count
|
||||
kill_count += 1
|
||||
return 0
|
||||
|
||||
import tools.process_registry as _pr
|
||||
import tools.terminal_tool as _tt
|
||||
import tools.browser_tool as _bt
|
||||
monkeypatch.setattr(_pr.process_registry, "kill_all", _fake_kill_all)
|
||||
monkeypatch.setattr(_tt, "cleanup_all_environments", lambda: None)
|
||||
monkeypatch.setattr(_bt, "cleanup_all_browsers", lambda: None)
|
||||
|
||||
# No running agents → drain returns immediately, no timeout, no eager cleanup.
|
||||
with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"):
|
||||
await runner.stop()
|
||||
|
||||
# Only the final catch-all fires on the graceful path.
|
||||
assert kill_count == 1
|
||||
|
|
|
|||
|
|
@ -95,7 +95,10 @@ class TestGeneratedSystemdUnits:
|
|||
assert "ExecStop=" not in unit
|
||||
assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
|
||||
assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
|
||||
assert "TimeoutStopSec=60" in unit
|
||||
# TimeoutStopSec must exceed the default drain_timeout (60s) so
|
||||
# systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
|
||||
# (tool subprocess kill, adapter disconnect) runs — issue #8202.
|
||||
assert "TimeoutStopSec=90" in unit
|
||||
|
||||
def test_user_unit_includes_resolved_node_directory_in_path(self, monkeypatch):
|
||||
monkeypatch.setattr(gateway_cli.shutil, "which", lambda cmd: "/home/test/.nvm/versions/node/v24.14.0/bin/node" if cmd == "node" else None)
|
||||
|
|
@ -111,7 +114,10 @@ class TestGeneratedSystemdUnits:
|
|||
assert "ExecStop=" not in unit
|
||||
assert "ExecReload=/bin/kill -USR1 $MAINPID" in unit
|
||||
assert f"RestartForceExitStatus={GATEWAY_SERVICE_RESTART_EXIT_CODE}" in unit
|
||||
assert "TimeoutStopSec=60" in unit
|
||||
# TimeoutStopSec must exceed the default drain_timeout (60s) so
|
||||
# systemd doesn't SIGKILL the cgroup before post-interrupt cleanup
|
||||
# (tool subprocess kill, adapter disconnect) runs — issue #8202.
|
||||
assert "TimeoutStopSec=90" in unit
|
||||
assert "WantedBy=multi-user.target" in unit
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue