From 3d029a53ec327fd7a628cde900bf300ca9a0cc0f Mon Sep 17 00:00:00 2001
From: Michael Steuer <michael@make.software>
Date: Mon, 8 Jun 2026 02:22:34 -0700
Subject: [PATCH] fix(gateway): close residual memory-leak sites under heavy
 scheduled workload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Long-lived gateways under heavy cron/build workloads grow steadily (~18 MB/hr
post-phantom-dispatch-fix) and eventually need a restart-or-OOM. Four retention
sites, all confirmed live on current main:

1. _evict_cached_agent() (/model, /reasoning, codex-runtime, /undo, etc.) popped
   the cache entry without releasing the agent's OpenAI client, httpx transport,
   SSL context, or conversation history. Only /new cleaned up first. Now releases
   clients on a daemon thread, matching _enforce_agent_cache_cap.

2. _release_evicted_agent_soft() now clears _session_messages after
   release_clients() — tool outputs (file reads, terminal output, search results)
   can be tens of MB per 100+-tool-call session; the list is rebuilt from
   persisted session JSON on resume, so dropping it on soft eviction is safe.

3. The session-expiry watcher (permanent finalization) now drops the session's
   per-session control dicts (_session_model_overrides, _session_reasoning_overrides,
   _pending_approvals, _update_prompt_pending, _pending_model_notes). These leaked
   one entry per session per gateway lifetime. NOTE: this is the session-finalize
   path, NOT idle agent-cache eviction — an idle-evicted session is still alive and
   rebuilds its agent from these overrides, so pruning them there would silently
   reset a user's /model choice.

4. _tool_defs_cache is now bounded (_TOOL_DEFS_CACHE_MAX=8) with oldest-first
   eviction instead of growing unboundedly across the distinct toolset/config
   fingerprints a gateway sees over its lifetime.

Salvaged from #25318 by Michael Steuer (@mssteuer); fix 3 redirected from the
idle-sweep to the session-finalize lifecycle, magic number 8 lifted to a named
constant, test ported.

Fixes #19251
Co-authored-by: Michael Steuer <michael@make.software>
---
 gateway/run.py                                | 42 +++++++++++++++++--
 model_tools.py                                | 13 ++++++
 ...st_get_tool_definitions_cache_isolation.py | 21 ++++++++++
 3 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/gateway/run.py b/gateway/run.py
index 8d512bf3b3c..739ba484909 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -5215,8 +5215,23 @@ class GatewayRunner(GatewayKanbanWatchersMixin, GatewaySlashCommandsMixin):
                         # be garbage-collected.  Otherwise the cache grows
                         # unbounded across the gateway's lifetime.
                         self._evict_cached_agent(key)
-                        # Mark as finalized and persist to disk so the flag
-                        # survives gateway restarts.
+                        # Permanently finalizing this session — drop its
+                        # per-session control state so the dicts don't grow
+                        # unbounded across the gateway's lifetime. (Idle
+                        # agent-cache eviction must NOT prune these: the
+                        # session is still alive and a resumed turn rebuilds
+                        # its agent from these overrides. Only true session
+                        # finalization, /new, and /reset clear them.)
+                        self._session_model_overrides.pop(key, None)
+                        self._set_session_reasoning_override(key, None)
+                        if hasattr(self, "_pending_model_notes"):
+                            self._pending_model_notes.pop(key, None)
+                        _pending_approvals = getattr(self, "_pending_approvals", None)
+                        if isinstance(_pending_approvals, dict):
+                            _pending_approvals.pop(key, None)
+                        _update_prompt_pending = getattr(self, "_update_prompt_pending", None)
+                        if isinstance(_update_prompt_pending, dict):
+                            _update_prompt_pending.pop(key, None)
                         with self.session_store._lock:
                             entry.expiry_finalized = True
                             self.session_store._save()
@@ -12482,7 +12497,21 @@ class GatewayRunner(GatewayKanbanWatchersMixin, GatewaySlashCommandsMixin):
         _lock = getattr(self, "_agent_cache_lock", None)
         if _lock:
             with _lock:
-                self._agent_cache.pop(session_key, None)
+                entry = self._agent_cache.pop(session_key, None)
+            # Release clients on a daemon thread, same as _enforce_agent_cache_cap.
+            # Without this, every /new, /model, /reasoning, codex-runtime change,
+            # and /undo leaked a full agent: OpenAI client, httpx transport, SSL
+            # context, and conversation history. Only the /new path cleaned up
+            # first; the rest popped the entry and dropped it on the floor.
+            if entry is not None:
+                agent = entry[0] if isinstance(entry, tuple) and entry else None
+                if agent is not None:
+                    threading.Thread(
+                        target=self._release_evicted_agent_soft,
+                        args=(agent,),
+                        daemon=True,
+                        name=f"agent-cache-cmd-evict-{session_key[:24]}",
+                    ).start()
 
     @staticmethod
     def _init_cached_agent_for_turn(agent: Any, interrupt_depth: int) -> None:
@@ -12524,6 +12553,13 @@ class GatewayRunner(GatewayKanbanWatchersMixin, GatewaySlashCommandsMixin):
                 self._cleanup_agent_resources(agent)
         except Exception:
             pass
+        # Free conversation history memory — can be tens of MB with tool
+        # outputs (file reads, terminal output, search results) on heavy
+        # 100+-tool-call sessions. release_clients() deliberately preserves
+        # session tool state for resume, but the message list is rebuilt from
+        # persisted session JSON on the next turn, so dropping it here is safe.
+        if hasattr(agent, "_session_messages"):
+            agent._session_messages = []
 
     def _enforce_agent_cache_cap(self) -> None:
         """Evict oldest cached agents when cache exceeds _AGENT_CACHE_MAX_SIZE.
diff --git a/model_tools.py b/model_tools.py
index 9d04ada2d75..22719a5daef 100644
--- a/model_tools.py
+++ b/model_tools.py
@@ -253,6 +253,14 @@ _LEGACY_TOOLSET_MAP = {
 # daemon start/stop, env var changes, etc.) on a 30 s horizon.
 _tool_defs_cache: Dict[tuple, List[Dict[str, Any]]] = {}
 
+# Hard cap on memoized get_tool_definitions() results. A long-lived Gateway
+# process sees many distinct toolset/config fingerprints over its lifetime
+# (per-session toolset sets, config edits, kanban-task toggles); without a
+# bound the cache grows unboundedly. 8 comfortably covers the warm working
+# set (the handful of distinct platform/toolset combos a gateway actually
+# serves) while keeping the cap small. (#19251)
+_TOOL_DEFS_CACHE_MAX = 8
+
 
 def _clear_tool_defs_cache() -> None:
     """Drop memoized get_tool_definitions() results. Called when dynamic
@@ -329,6 +337,11 @@ def get_tool_definitions(
         # agent inits and providers that enforce unique tool names
         # (DeepSeek, Xiaomi MiMo, Moonshot Kimi) reject the request with
         # HTTP 400. Mirrors the cache-hit path above. (issue #17335)
+        # Bound the cache with LRU eviction so a long-lived Gateway process
+        # doesn't accumulate entries unboundedly across the many distinct
+        # toolset/config fingerprints it sees over its lifetime (#19251).
+        if len(_tool_defs_cache) >= _TOOL_DEFS_CACHE_MAX:
+            _tool_defs_cache.pop(next(iter(_tool_defs_cache)))  # evict oldest
         _tool_defs_cache[cache_key] = result
         return list(result)
     return result
diff --git a/tests/test_get_tool_definitions_cache_isolation.py b/tests/test_get_tool_definitions_cache_isolation.py
index b92ef9dc454..bf131804e04 100644
--- a/tests/test_get_tool_definitions_cache_isolation.py
+++ b/tests/test_get_tool_definitions_cache_isolation.py
@@ -87,6 +87,27 @@ class TestQuietModeCacheIsolation:
             f"baseline={baseline}, final={len(final)}."
         )
 
+    def test_cache_bounded_by_eviction(self):
+        """The cache evicts the oldest entry when it reaches the cap,
+        keeping the cache bounded instead of growing unbounded over a
+        long-lived Gateway's lifetime (#19251)."""
+        cap = model_tools._TOOL_DEFS_CACHE_MAX
+        # Fill cache to the cap with distinct keys by varying enabled_toolsets.
+        for i in range(cap):
+            model_tools.get_tool_definitions(
+                enabled_toolsets=[f"fake_toolset_{i}"], quiet_mode=True,
+            )
+        assert len(model_tools._tool_defs_cache) == cap
+
+        # Adding one more must evict the oldest, not clear everything and
+        # not grow past the cap.
+        model_tools.get_tool_definitions(
+            enabled_toolsets=["fake_toolset_overflow"], quiet_mode=True,
+        )
+        assert len(model_tools._tool_defs_cache) == cap, (
+            "Eviction should keep the cache at the cap, not clear it or grow"
+        )
+
     def test_non_quiet_mode_does_not_use_cache(self):
         """Sanity: quiet_mode=False (TUI path) skips the cache entirely \u2014
         explains why the bug only hit Gateway."""