From 9f67ba1b0182db31c0bcd08718f681a074373c16 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sun, 21 Jun 2026 07:25:42 -0700
Subject: [PATCH] fix(agent): guard finalize_turn cleanup chain so it never
 drops the response (#50009)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a turn hit max_iterations, finalize_turn ran three unguarded cleanup
steps after the model's summary — _save_trajectory (file I/O), _cleanup_task_resources
(remote VM/browser teardown), and _persist_session (SQLite write). Any raise
there propagated out of run_conversation, discarding the partial final_response
the caller was waiting for; subprocess wrappers saw an empty stdout with no
traceback (#8049).

Each step is now guarded independently so one failure can't skip the others.
Failures log at ERROR with a traceback and are surfaced on the result dict via
cleanup_errors; the partial response is always returned.

Closes #8049.
---
 agent/turn_finalizer.py                       |  38 +++-
 .../test_turn_finalizer_cleanup_guard.py      | 165 ++++++++++++++++++
 2 files changed, 199 insertions(+), 4 deletions(-)
 create mode 100644 tests/agent/test_turn_finalizer_cleanup_guard.py

diff --git a/agent/turn_finalizer.py b/agent/turn_finalizer.py
index 20db3fcef9f..91496d72040 100644
--- a/agent/turn_finalizer.py
+++ b/agent/turn_finalizer.py
@@ -128,19 +128,44 @@ def finalize_turn(
         and not failed
     )
 
+    # Post-loop cleanup must never lose the response.  Trajectory save,
+    # resource teardown, and session persistence all touch fallible
+    # surfaces — file I/O / JSON serialization (_save_trajectory), remote
+    # VM/browser teardown over the network (_cleanup_task_resources), and
+    # SQLite writes (_persist_session).  A raise from any of them used to
+    # propagate straight out of run_conversation, discarding the partial
+    # final_response the caller is waiting for (subprocess wrappers saw an
+    # empty stdout with no traceback — #8049).  Each step is now guarded
+    # independently so one failure can't skip the others, and any errors
+    # are surfaced on the result dict via ``cleanup_errors`` rather than
+    # killing the turn.
+    _cleanup_errors = []
+
     # Save trajectory if enabled.  ``user_message`` may be a multimodal
     # list of parts; the trajectory format wants a plain string.
-    agent._save_trajectory(messages, _summarize_user_message_for_log(user_message), completed)
+    try:
+        agent._save_trajectory(messages, _summarize_user_message_for_log(user_message), completed)
+    except Exception as _save_err:
+        _cleanup_errors.append(f"save_trajectory: {_save_err}")
+        logger.error("finalize_turn: _save_trajectory failed: %s", _save_err, exc_info=True)
 
     # Clean up VM and browser for this task after conversation completes
-    agent._cleanup_task_resources(effective_task_id)
+    try:
+        agent._cleanup_task_resources(effective_task_id)
+    except Exception as _cleanup_err:
+        _cleanup_errors.append(f"cleanup_task_resources: {_cleanup_err}")
+        logger.error("finalize_turn: _cleanup_task_resources failed: %s", _cleanup_err, exc_info=True)
 
     # Persist session to both JSON log and SQLite only after private retry
     # scaffolding has been removed. Otherwise a later user "continue" turn
     # can replay assistant("(empty)") / recovery nudges and fall into the
     # same empty-response loop again.
-    agent._drop_trailing_empty_response_scaffolding(messages)
-    agent._persist_session(messages, conversation_history)
+    try:
+        agent._drop_trailing_empty_response_scaffolding(messages)
+        agent._persist_session(messages, conversation_history)
+    except Exception as _persist_err:
+        _cleanup_errors.append(f"persist_session: {_persist_err}")
+        logger.error("finalize_turn: _persist_session failed: %s", _persist_err, exc_info=True)
 
     # ── Turn-exit diagnostic log ─────────────────────────────────────
     # Always logged at INFO so agent.log captures WHY every turn ended.
@@ -354,6 +379,11 @@ def finalize_turn(
     }
     if agent._tool_guardrail_halt_decision is not None:
         result["guardrail"] = agent._tool_guardrail_halt_decision.to_metadata()
+    # Surface any post-loop cleanup failures so the caller can distinguish a
+    # clean turn from one whose trajectory/session/resource teardown raised
+    # (the response is still returned either way — #8049).
+    if _cleanup_errors:
+        result["cleanup_errors"] = _cleanup_errors
     # If a /steer landed after the final assistant turn (no more tool
     # batches to drain into), hand it back to the caller so it can be
     # delivered as the next user turn instead of being silently lost.
diff --git a/tests/agent/test_turn_finalizer_cleanup_guard.py b/tests/agent/test_turn_finalizer_cleanup_guard.py
new file mode 100644
index 00000000000..e988501dc8e
--- /dev/null
+++ b/tests/agent/test_turn_finalizer_cleanup_guard.py
@@ -0,0 +1,165 @@
+"""Regression test for #8049.
+
+When the post-loop cleanup chain in ``finalize_turn`` raises — trajectory
+save (file I/O), resource teardown (remote VM/browser), or session
+persistence (SQLite) — the partial ``final_response`` the caller is waiting
+for must still be returned.  Previously any of those raised straight out of
+``run_conversation``, so a subprocess wrapper saw an empty stdout with no
+traceback and lost the whole turn.
+"""
+
+import pytest
+
+from agent.turn_finalizer import finalize_turn
+
+
+class _StubBudget:
+    used = 5
+    max_total = 3
+    remaining = 0
+
+
+class _StubCompressor:
+    last_prompt_tokens = 0
+
+
+class _StubAgent:
+    """Minimal agent surface that ``finalize_turn`` reads from."""
+
+    def __init__(self, *, raise_in):
+        self._raise_in = set(raise_in)
+        self.max_iterations = 3
+        self.iteration_budget = _StubBudget()
+        self.context_compressor = _StubCompressor()
+        self.model = "stub/model"
+        self.provider = "stub"
+        self.base_url = "http://stub"
+        self.session_id = "sess-1"
+        self.quiet_mode = True
+        self.platform = "cli"
+        self._interrupt_requested = False
+        self._interrupt_message = None
+        self._tool_guardrail_halt_decision = None
+        self._response_was_previewed = False
+        self._skill_nudge_interval = 0
+        self._iters_since_skill = 0
+        for attr in (
+            "session_input_tokens",
+            "session_output_tokens",
+            "session_cache_read_tokens",
+            "session_cache_write_tokens",
+            "session_reasoning_tokens",
+            "session_prompt_tokens",
+            "session_completion_tokens",
+            "session_total_tokens",
+            "session_estimated_cost_usd",
+        ):
+            setattr(self, attr, 0)
+        self.session_cost_status = "ok"
+        self.session_cost_source = "stub"
+
+    # --- fallible cleanup surfaces -------------------------------------
+    def _save_trajectory(self, *a, **k):
+        if "save_trajectory" in self._raise_in:
+            raise RuntimeError("trajectory disk full")
+
+    def _cleanup_task_resources(self, *a, **k):
+        if "cleanup_task_resources" in self._raise_in:
+            raise RuntimeError("docker teardown EOF")
+
+    def _drop_trailing_empty_response_scaffolding(self, *a, **k):
+        pass
+
+    def _persist_session(self, *a, **k):
+        if "persist_session" in self._raise_in:
+            raise RuntimeError("sqlite database is locked")
+
+    # --- harmless no-ops ------------------------------------------------
+    def _emit_status(self, *a, **k):
+        pass
+
+    def _safe_print(self, *a, **k):
+        pass
+
+    def _handle_max_iterations(self, messages, n):
+        return "PARTIAL SUMMARY FROM MODEL"
+
+    def _file_mutation_verifier_enabled(self):
+        return False
+
+    def _turn_completion_explainer_enabled(self):
+        return False
+
+    def _drain_pending_steer(self):
+        return None
+
+    def clear_interrupt(self):
+        pass
+
+    def _sync_external_memory_for_turn(self, **k):
+        pass
+
+
+def _run(agent):
+    messages = [
+        {"role": "user", "content": "do a thing"},
+        {
+            "role": "assistant",
+            "content": "",
+            "tool_calls": [
+                {"id": "c1", "function": {"name": "read_file", "arguments": "{}"}}
+            ],
+        },
+        {"role": "tool", "tool_call_id": "c1", "content": "file contents"},
+    ]
+    return finalize_turn(
+        agent,
+        final_response=None,  # forces the max-iterations summary path
+        api_call_count=3,
+        interrupted=False,
+        failed=False,
+        messages=messages,
+        conversation_history=None,
+        effective_task_id="task-1",
+        turn_id="turn-1",
+        user_message="do a thing",
+        original_user_message="do a thing",
+        _should_review_memory=False,
+        _turn_exit_reason="unknown",
+    )
+
+
+def test_all_cleanup_steps_raise_response_still_returned():
+    agent = _StubAgent(
+        raise_in=("save_trajectory", "cleanup_task_resources", "persist_session")
+    )
+    result = _run(agent)
+    assert result["final_response"] == "PARTIAL SUMMARY FROM MODEL"
+    labels = [e.split(":")[0] for e in result["cleanup_errors"]]
+    assert labels == ["save_trajectory", "cleanup_task_resources", "persist_session"]
+
+
+@pytest.mark.parametrize(
+    "step", ["save_trajectory", "cleanup_task_resources", "persist_session"]
+)
+def test_single_cleanup_step_raises_does_not_skip_others(step):
+    agent = _StubAgent(raise_in=(step,))
+    result = _run(agent)
+    # Response survives.
+    assert result["final_response"] == "PARTIAL SUMMARY FROM MODEL"
+    # Exactly the failing step is recorded; the others ran without error.
+    assert result["cleanup_errors"] == [
+        next(
+            e
+            for e in result["cleanup_errors"]
+            if e.startswith(step)
+        )
+    ]
+    assert len(result["cleanup_errors"]) == 1
+
+
+def test_clean_turn_has_no_cleanup_errors_key():
+    agent = _StubAgent(raise_in=())
+    result = _run(agent)
+    assert result["final_response"] == "PARTIAL SUMMARY FROM MODEL"
+    assert "cleanup_errors" not in result