From fbe28352e49ed9cf34ab8c2b0d14ea48c993fd51 Mon Sep 17 00:00:00 2001 From: pefontana Date: Fri, 10 Apr 2026 16:22:59 -0300 Subject: [PATCH] fix(gateway): call agent.close() on session end to prevent zombies Wire AIAgent.close() into every gateway code path where an agent's session is actually ending: - stop(): close all running agents after interrupt + memory shutdown, then call cleanup_all_environments() and cleanup_all_browsers() as a global catch-all - _session_expiry_watcher(): close agents when sessions expire after the 5-minute idle timeout - _handle_reset_command(): close the old agent before evicting it from cache on /new or /reset Note: _evict_cached_agent() intentionally does NOT call close() because it is also used for non-destructive cache refreshes (model switch, branch, fallback) where tool resources should persist. Ref: #7131 --- gateway/run.py | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 659ba801369..694bbfe628f 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1356,6 +1356,12 @@ class GatewayRunner: cached_agent.shutdown_memory_provider() except Exception: pass + # Close tool resources to prevent zombie processes + try: + if hasattr(cached_agent, 'close'): + cached_agent.close() + except Exception: + pass # Mark as flushed and persist to disk so the flag # survives gateway restarts. with self.session_store._lock: @@ -1536,6 +1542,14 @@ class GatewayRunner: agent.shutdown_memory_provider() except Exception: pass + # Close tool resources (terminal sandboxes, browser daemons, + # background processes, httpx clients) to prevent zombie + # process accumulation. + try: + if hasattr(agent, 'close'): + agent.close() + except Exception: + pass for platform, adapter in list(self.adapters.items()): try: @@ -1558,7 +1572,20 @@ class GatewayRunner: self._pending_messages.clear() self._pending_approvals.clear() self._shutdown_event.set() - + + # Global cleanup: kill any remaining tool subprocesses not tied + # to a specific agent (catch-all for zombie prevention). + try: + from tools.terminal_tool import cleanup_all_environments + cleanup_all_environments() + except Exception: + pass + try: + from tools.browser_tool import cleanup_all_browsers + cleanup_all_browsers() + except Exception: + pass + from gateway.status import remove_pid_file, write_runtime_status remove_pid_file() try: @@ -3335,8 +3362,21 @@ class GatewayRunner: _flush_task.add_done_callback(self._background_tasks.discard) except Exception as e: logger.debug("Gateway memory flush on reset failed: %s", e) + # Close tool resources on the old agent (terminal sandboxes, browser + # daemons, background processes) before evicting from cache. + _lock = getattr(self, "_agent_cache_lock", None) + if _lock: + with _lock: + _cached = self._agent_cache.get(session_key) + _old_agent = _cached[0] if isinstance(_cached, tuple) else _cached if _cached else None + if _old_agent is not None: + try: + if hasattr(_old_agent, "close"): + _old_agent.close() + except Exception: + pass self._evict_cached_agent(session_key) - + try: from tools.env_passthrough import clear_env_passthrough clear_env_passthrough()