From 9f67ba1b0182db31c0bcd08718f681a074373c16 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sun, 21 Jun 2026 07:25:42 -0700 Subject: [PATCH] fix(agent): guard finalize_turn cleanup chain so it never drops the response (#50009) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a turn hit max_iterations, finalize_turn ran three unguarded cleanup steps after the model's summary — _save_trajectory (file I/O), _cleanup_task_resources (remote VM/browser teardown), and _persist_session (SQLite write). Any raise there propagated out of run_conversation, discarding the partial final_response the caller was waiting for; subprocess wrappers saw an empty stdout with no traceback (#8049). Each step is now guarded independently so one failure can't skip the others. Failures log at ERROR with a traceback and are surfaced on the result dict via cleanup_errors; the partial response is always returned. Closes #8049. --- agent/turn_finalizer.py | 38 +++- .../test_turn_finalizer_cleanup_guard.py | 165 ++++++++++++++++++ 2 files changed, 199 insertions(+), 4 deletions(-) create mode 100644 tests/agent/test_turn_finalizer_cleanup_guard.py diff --git a/agent/turn_finalizer.py b/agent/turn_finalizer.py index 20db3fcef9f..91496d72040 100644 --- a/agent/turn_finalizer.py +++ b/agent/turn_finalizer.py @@ -128,19 +128,44 @@ def finalize_turn( and not failed ) + # Post-loop cleanup must never lose the response. Trajectory save, + # resource teardown, and session persistence all touch fallible + # surfaces — file I/O / JSON serialization (_save_trajectory), remote + # VM/browser teardown over the network (_cleanup_task_resources), and + # SQLite writes (_persist_session). A raise from any of them used to + # propagate straight out of run_conversation, discarding the partial + # final_response the caller is waiting for (subprocess wrappers saw an + # empty stdout with no traceback — #8049). Each step is now guarded + # independently so one failure can't skip the others, and any errors + # are surfaced on the result dict via ``cleanup_errors`` rather than + # killing the turn. + _cleanup_errors = [] + # Save trajectory if enabled. ``user_message`` may be a multimodal # list of parts; the trajectory format wants a plain string. - agent._save_trajectory(messages, _summarize_user_message_for_log(user_message), completed) + try: + agent._save_trajectory(messages, _summarize_user_message_for_log(user_message), completed) + except Exception as _save_err: + _cleanup_errors.append(f"save_trajectory: {_save_err}") + logger.error("finalize_turn: _save_trajectory failed: %s", _save_err, exc_info=True) # Clean up VM and browser for this task after conversation completes - agent._cleanup_task_resources(effective_task_id) + try: + agent._cleanup_task_resources(effective_task_id) + except Exception as _cleanup_err: + _cleanup_errors.append(f"cleanup_task_resources: {_cleanup_err}") + logger.error("finalize_turn: _cleanup_task_resources failed: %s", _cleanup_err, exc_info=True) # Persist session to both JSON log and SQLite only after private retry # scaffolding has been removed. Otherwise a later user "continue" turn # can replay assistant("(empty)") / recovery nudges and fall into the # same empty-response loop again. - agent._drop_trailing_empty_response_scaffolding(messages) - agent._persist_session(messages, conversation_history) + try: + agent._drop_trailing_empty_response_scaffolding(messages) + agent._persist_session(messages, conversation_history) + except Exception as _persist_err: + _cleanup_errors.append(f"persist_session: {_persist_err}") + logger.error("finalize_turn: _persist_session failed: %s", _persist_err, exc_info=True) # ── Turn-exit diagnostic log ───────────────────────────────────── # Always logged at INFO so agent.log captures WHY every turn ended. @@ -354,6 +379,11 @@ def finalize_turn( } if agent._tool_guardrail_halt_decision is not None: result["guardrail"] = agent._tool_guardrail_halt_decision.to_metadata() + # Surface any post-loop cleanup failures so the caller can distinguish a + # clean turn from one whose trajectory/session/resource teardown raised + # (the response is still returned either way — #8049). + if _cleanup_errors: + result["cleanup_errors"] = _cleanup_errors # If a /steer landed after the final assistant turn (no more tool # batches to drain into), hand it back to the caller so it can be # delivered as the next user turn instead of being silently lost. diff --git a/tests/agent/test_turn_finalizer_cleanup_guard.py b/tests/agent/test_turn_finalizer_cleanup_guard.py new file mode 100644 index 00000000000..e988501dc8e --- /dev/null +++ b/tests/agent/test_turn_finalizer_cleanup_guard.py @@ -0,0 +1,165 @@ +"""Regression test for #8049. + +When the post-loop cleanup chain in ``finalize_turn`` raises — trajectory +save (file I/O), resource teardown (remote VM/browser), or session +persistence (SQLite) — the partial ``final_response`` the caller is waiting +for must still be returned. Previously any of those raised straight out of +``run_conversation``, so a subprocess wrapper saw an empty stdout with no +traceback and lost the whole turn. +""" + +import pytest + +from agent.turn_finalizer import finalize_turn + + +class _StubBudget: + used = 5 + max_total = 3 + remaining = 0 + + +class _StubCompressor: + last_prompt_tokens = 0 + + +class _StubAgent: + """Minimal agent surface that ``finalize_turn`` reads from.""" + + def __init__(self, *, raise_in): + self._raise_in = set(raise_in) + self.max_iterations = 3 + self.iteration_budget = _StubBudget() + self.context_compressor = _StubCompressor() + self.model = "stub/model" + self.provider = "stub" + self.base_url = "http://stub" + self.session_id = "sess-1" + self.quiet_mode = True + self.platform = "cli" + self._interrupt_requested = False + self._interrupt_message = None + self._tool_guardrail_halt_decision = None + self._response_was_previewed = False + self._skill_nudge_interval = 0 + self._iters_since_skill = 0 + for attr in ( + "session_input_tokens", + "session_output_tokens", + "session_cache_read_tokens", + "session_cache_write_tokens", + "session_reasoning_tokens", + "session_prompt_tokens", + "session_completion_tokens", + "session_total_tokens", + "session_estimated_cost_usd", + ): + setattr(self, attr, 0) + self.session_cost_status = "ok" + self.session_cost_source = "stub" + + # --- fallible cleanup surfaces ------------------------------------- + def _save_trajectory(self, *a, **k): + if "save_trajectory" in self._raise_in: + raise RuntimeError("trajectory disk full") + + def _cleanup_task_resources(self, *a, **k): + if "cleanup_task_resources" in self._raise_in: + raise RuntimeError("docker teardown EOF") + + def _drop_trailing_empty_response_scaffolding(self, *a, **k): + pass + + def _persist_session(self, *a, **k): + if "persist_session" in self._raise_in: + raise RuntimeError("sqlite database is locked") + + # --- harmless no-ops ------------------------------------------------ + def _emit_status(self, *a, **k): + pass + + def _safe_print(self, *a, **k): + pass + + def _handle_max_iterations(self, messages, n): + return "PARTIAL SUMMARY FROM MODEL" + + def _file_mutation_verifier_enabled(self): + return False + + def _turn_completion_explainer_enabled(self): + return False + + def _drain_pending_steer(self): + return None + + def clear_interrupt(self): + pass + + def _sync_external_memory_for_turn(self, **k): + pass + + +def _run(agent): + messages = [ + {"role": "user", "content": "do a thing"}, + { + "role": "assistant", + "content": "", + "tool_calls": [ + {"id": "c1", "function": {"name": "read_file", "arguments": "{}"}} + ], + }, + {"role": "tool", "tool_call_id": "c1", "content": "file contents"}, + ] + return finalize_turn( + agent, + final_response=None, # forces the max-iterations summary path + api_call_count=3, + interrupted=False, + failed=False, + messages=messages, + conversation_history=None, + effective_task_id="task-1", + turn_id="turn-1", + user_message="do a thing", + original_user_message="do a thing", + _should_review_memory=False, + _turn_exit_reason="unknown", + ) + + +def test_all_cleanup_steps_raise_response_still_returned(): + agent = _StubAgent( + raise_in=("save_trajectory", "cleanup_task_resources", "persist_session") + ) + result = _run(agent) + assert result["final_response"] == "PARTIAL SUMMARY FROM MODEL" + labels = [e.split(":")[0] for e in result["cleanup_errors"]] + assert labels == ["save_trajectory", "cleanup_task_resources", "persist_session"] + + +@pytest.mark.parametrize( + "step", ["save_trajectory", "cleanup_task_resources", "persist_session"] +) +def test_single_cleanup_step_raises_does_not_skip_others(step): + agent = _StubAgent(raise_in=(step,)) + result = _run(agent) + # Response survives. + assert result["final_response"] == "PARTIAL SUMMARY FROM MODEL" + # Exactly the failing step is recorded; the others ran without error. + assert result["cleanup_errors"] == [ + next( + e + for e in result["cleanup_errors"] + if e.startswith(step) + ) + ] + assert len(result["cleanup_errors"]) == 1 + + +def test_clean_turn_has_no_cleanup_errors_key(): + agent = _StubAgent(raise_in=()) + result = _run(agent) + assert result["final_response"] == "PARTIAL SUMMARY FROM MODEL" + assert "cleanup_errors" not in result