From 3e3ec35a5e0235184bb5c11e3404dd3d51458af0 Mon Sep 17 00:00:00 2001
From: konsisumer <der@konsi.org>
Date: Thu, 16 Apr 2026 09:17:24 +0200
Subject: [PATCH] fix: surface execute_code timeout to user instead of silently
 dropping (#10807)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When execute_code times out, the result JSON has status="timeout" and an
error field, but the output field is empty.  Many models treat empty
output as "nothing happened" and produce an empty/minimal response.  The
gateway stream consumer then considers the response "already sent" (from
pre-tool streaming) and silently drops it — leaving the user staring at
silence.

Three changes:

1. Include the timeout message in the output field (both local and remote
   paths) so the model always has visible content to relay to the user.

2. Add periodic activity callbacks to the local execution polling loop so
   the gateway's inactivity monitor knows execute_code is alive during
   long runs.

3. Fix stream_consumer._send_fallback_final to not silently drop content
   when the continuation appears empty but the final text differs from
   what was previously streamed (e.g. after a tool boundary reset).
---
 gateway/stream_consumer.py   | 14 +++++++++---
 tools/code_execution_tool.py | 43 ++++++++++++++++++++++++++++++++++--
 2 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/gateway/stream_consumer.py b/gateway/stream_consumer.py
index a644547e6a..853b159034 100644
--- a/gateway/stream_consumer.py
+++ b/gateway/stream_consumer.py
@@ -515,9 +515,17 @@ class GatewayStreamConsumer:
         self._fallback_final_send = False
         if not continuation.strip():
             # Nothing new to send — the visible partial already matches final text.
-            self._already_sent = True
-            self._final_response_sent = True
-            return
+            # BUT: if final_text itself has meaningful content (e.g. a timeout
+            # message after a long tool call), the prefix-based continuation
+            # calculation may wrongly conclude "already shown" because the
+            # streamed prefix was from a *previous* segment (before the tool
+            # boundary).  In that case, send the full final_text as-is (#10807).
+            if final_text.strip() and final_text != self._visible_prefix():
+                continuation = final_text
+            else:
+                self._already_sent = True
+                self._final_response_sent = True
+                return
 
         raw_limit = getattr(self.adapter, "MAX_MESSAGE_LENGTH", 4096)
         safe_limit = max(500, raw_limit - 100)
diff --git a/tools/code_execution_tool.py b/tools/code_execution_tool.py
index 8cffeda804..d61164bcac 100644
--- a/tools/code_execution_tool.py
+++ b/tools/code_execution_tool.py
@@ -871,7 +871,18 @@ def _execute_remote(
     }
 
     if status == "timeout":
-        result["error"] = f"Script timed out after {timeout}s and was killed."
+        timeout_msg = f"Script timed out after {timeout}s and was killed."
+        result["error"] = timeout_msg
+        # Include timeout message in output so the LLM always surfaces it
+        # to the user (see local path comment — same reasoning, #10807).
+        if stdout_text:
+            result["output"] = stdout_text + f"\n\n⏰ {timeout_msg}"
+        else:
+            result["output"] = f"⏰ {timeout_msg}"
+        logger.warning(
+            "execute_code (remote) timed out after %ss (limit %ss) with %d tool calls",
+            duration, timeout, tool_call_counter[0],
+        )
     elif status == "interrupted":
         result["output"] = (
             stdout_text + "\n[execution interrupted — user sent a new message]"
@@ -1117,6 +1128,8 @@ def execute_code(
         stderr_reader.start()
 
         status = "success"
+        _last_activity_touch = time.monotonic()
+        _ACTIVITY_INTERVAL = 10.0
         while proc.poll() is None:
             if _is_interrupted():
                 _kill_process_group(proc)
@@ -1126,6 +1139,19 @@ def execute_code(
                 _kill_process_group(proc, escalate=True)
                 status = "timeout"
                 break
+            # Periodic activity touch so the gateway's inactivity timeout
+            # doesn't kill the agent during long code execution (#10807).
+            _now = time.monotonic()
+            if _now - _last_activity_touch >= _ACTIVITY_INTERVAL:
+                _last_activity_touch = _now
+                try:
+                    from tools.environments.base import _get_activity_callback
+                    _cb = _get_activity_callback()
+                    if _cb:
+                        _elapsed = int(_now - exec_start)
+                        _cb(f"execute_code running ({_elapsed}s elapsed)")
+                except Exception:
+                    pass
             time.sleep(0.2)
 
         # Wait for readers to finish draining
@@ -1179,7 +1205,20 @@ def execute_code(
         }
 
         if status == "timeout":
-            result["error"] = f"Script timed out after {timeout}s and was killed."
+            timeout_msg = f"Script timed out after {timeout}s and was killed."
+            result["error"] = timeout_msg
+            # Include timeout message in output so the LLM always surfaces it
+            # to the user.  When output is empty, models often treat the result
+            # as "nothing happened" and produce an empty response, which the
+            # gateway stream consumer silently drops (#10807).
+            if stdout_text:
+                result["output"] = stdout_text + f"\n\n⏰ {timeout_msg}"
+            else:
+                result["output"] = f"⏰ {timeout_msg}"
+            logger.warning(
+                "execute_code timed out after %ss (limit %ss) with %d tool calls",
+                duration, timeout, tool_call_counter[0],
+            )
         elif status == "interrupted":
             result["output"] = stdout_text + "\n[execution interrupted — user sent a new message]"
         elif exit_code != 0: