fix(agent): close tool-call sequence on all interrupt aborts, not just finalize_turn

#48879 closed the tool-call sequence on interrupt inside finalize_turn so a
/stop after a tool no longer persists a `tool` tail that the next user message
turns into a `tool -> user` role-alternation violation (which strict providers
like Gemini/Claude react to by hallucinating a continuation and ignoring prior
context — what users see as "lost context after stop").

But the retry-wait, error-handling, and post-error retry-wait interrupt aborts
in conversation_loop return early and never reach finalize_turn, so they still
persisted and returned a raw `tool` tail. Interrupting during provider
backoff/rate-limiting (common under heavy work) hit exactly this path.

Extract the close into a shared close_interrupted_tool_sequence helper and apply
it at every interrupt abort (finalize_turn + the three early returns) so the
whole bug class is fixed, not just the one site.
This commit is contained in:
Brooklyn Nicholson 2026-06-25 12:19:53 -05:00
parent a6a28ce3e2
commit 2d286a6d00
4 changed files with 132 additions and 10 deletions

View file

@ -35,6 +35,7 @@ from agent.turn_context import build_turn_context
from agent.turn_retry_state import TurnRetryState
from agent.memory_manager import build_memory_context_block
from agent.message_sanitization import (
close_interrupted_tool_sequence,
_repair_tool_call_arguments,
_sanitize_messages_non_ascii,
_sanitize_messages_surrogates,
@ -1396,10 +1397,12 @@ def run_conversation(
while time.time() < sleep_end:
if agent._interrupt_requested:
agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
_interrupt_text = f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries})."
close_interrupted_tool_sequence(messages, _interrupt_text)
agent._persist_session(messages, conversation_history)
agent.clear_interrupt()
return {
"final_response": f"Operation interrupted during retry ({_failure_hint}, attempt {retry_count}/{max_retries}).",
"final_response": _interrupt_text,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
@ -2663,10 +2666,12 @@ def run_conversation(
# Check for interrupt before deciding to retry
if agent._interrupt_requested:
agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during error handling, aborting retries.", force=True)
_interrupt_text = f"Operation interrupted: handling API error ({error_type}: {agent._clean_error_message(str(api_error))})."
close_interrupted_tool_sequence(messages, _interrupt_text)
agent._persist_session(messages, conversation_history)
agent.clear_interrupt()
return {
"final_response": f"Operation interrupted: handling API error ({error_type}: {agent._clean_error_message(str(api_error))}).",
"final_response": _interrupt_text,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
@ -3578,10 +3583,12 @@ def run_conversation(
while time.time() < sleep_end:
if agent._interrupt_requested:
agent._vprint(f"{agent.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
_interrupt_text = f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries})."
close_interrupted_tool_sequence(messages, _interrupt_text)
agent._persist_session(messages, conversation_history)
agent.clear_interrupt()
return {
"final_response": f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries}).",
"final_response": _interrupt_text,
"messages": messages,
"api_calls": api_call_count,
"completed": False,

View file

@ -279,6 +279,38 @@ def _repair_tool_call_arguments(raw_args: str, tool_name: str = "?") -> str:
return "{}"
def close_interrupted_tool_sequence(messages: list, final_response: Any = None) -> bool:
"""Append a synthetic assistant turn when an interrupted tail is a tool result.
A turn cut short by ``/stop`` can leave the transcript ending on a raw
``tool`` message (a tool finished, or its execution was cancelled, but the
model never streamed a closing assistant turn). Persisting that tail means
the next user message lands as `` tool user`` a role-alternation
violation that strict providers (Gemini, Claude) react to by hallucinating
a continuation of the user's message and ignoring prior context, which
reads to the user as "lost context" (#48879).
``finalize_turn`` closes this on the happy interrupt path, but the
retry/backoff/error interrupt aborts in ``conversation_loop`` ``return``
early and never reach it this shared helper closes the sequence on all of
them. ``final_response`` is usually empty on an interrupt, so an explicit
placeholder is used rather than an empty-content assistant turn.
Mutates ``messages`` in place. Returns True if a closing turn was appended.
"""
if not messages:
return False
last = messages[-1]
if not isinstance(last, dict) or last.get("role") != "tool":
return False
text = final_response if isinstance(final_response, str) else ""
messages.append({
"role": "assistant",
"content": text.strip() or "Operation interrupted.",
})
return True
def _strip_non_ascii(text: str) -> str:
"""Remove non-ASCII characters, replacing with closest ASCII equivalent or removing.
@ -431,6 +463,7 @@ def _sanitize_structure_non_ascii(payload: Any) -> bool:
__all__ = [
"_SURROGATE_RE",
"close_interrupted_tool_sequence",
"_sanitize_surrogates",
"_sanitize_structure_surrogates",
"_sanitize_messages_surrogates",

View file

@ -181,13 +181,9 @@ def finalize_turn(
# here instead. On an interrupt ``final_response`` is typically
# empty, so fall back to an explicit placeholder rather than
# persisting an empty-content assistant turn.
if interrupted and messages and messages[-1].get("role") == "tool":
messages.append(
{
"role": "assistant",
"content": (final_response or "").strip() or "Operation interrupted.",
}
)
if interrupted:
from agent.message_sanitization import close_interrupted_tool_sequence
close_interrupted_tool_sequence(messages, final_response)
agent._persist_session(messages, conversation_history)
except Exception as _persist_err:

View file

@ -0,0 +1,86 @@
"""Regression tests for ``close_interrupted_tool_sequence`` (#48879 follow-up).
#48879 closed the tool-call sequence on interrupt inside ``finalize_turn``,
but the retry/backoff/error interrupt aborts in ``conversation_loop`` ``return``
early and never reach it so they persisted a raw ``tool`` tail. The next user
message then lands as ``... tool user``, the role-alternation violation that
makes strict providers (Gemini, Claude) hallucinate a continuation and ignore
prior context (what the user perceives as "lost context").
The fix routes every interrupt abort through this one shared helper. These tests
pin the helper's contract and prove the post-interrupt + next-user-message
transcript is alternation-safe.
"""
from agent.message_sanitization import close_interrupted_tool_sequence
def _tool_tail():
return [
{"role": "user", "content": "edit the file"},
{
"role": "assistant",
"content": "",
"tool_calls": [{"id": "c1", "function": {"name": "patch", "arguments": "{}"}}],
},
{"role": "tool", "tool_call_id": "c1", "content": "ok edited"},
]
def _assert_no_tool_then_user(messages):
for i in range(len(messages) - 1):
if messages[i].get("role") == "tool":
assert messages[i + 1].get("role") != "user", (
f"role-alternation violation: tool → user at index {i}"
)
def test_tool_tail_is_closed_with_placeholder():
messages = _tool_tail()
assert close_interrupted_tool_sequence(messages, None) is True
assert messages[-1]["role"] == "assistant"
assert messages[-1]["content"] == "Operation interrupted."
def test_tool_tail_keeps_interrupt_text_when_present():
messages = _tool_tail()
close_interrupted_tool_sequence(messages, "Operation interrupted during retry (attempt 2/3).")
assert messages[-1]["role"] == "assistant"
assert messages[-1]["content"] == "Operation interrupted during retry (attempt 2/3)."
def test_blank_interrupt_text_falls_back_to_placeholder():
messages = _tool_tail()
close_interrupted_tool_sequence(messages, " ")
assert messages[-1]["content"] == "Operation interrupted."
def test_closing_makes_next_user_message_alternation_safe():
"""The whole point: appending a user turn after the close must not
produce the ``tool user`` shape strict providers choke on."""
messages = _tool_tail()
close_interrupted_tool_sequence(messages, None)
follow_on = messages + [{"role": "user", "content": "they do! increase the timing"}]
_assert_no_tool_then_user(follow_on)
def test_assistant_tail_is_left_untouched():
messages = [
{"role": "user", "content": "hi"},
{"role": "assistant", "content": "partial reply"},
]
before = [dict(m) for m in messages]
assert close_interrupted_tool_sequence(messages, "interrupted") is False
assert messages == before
def test_user_tail_is_left_untouched():
messages = [{"role": "user", "content": "hi"}]
assert close_interrupted_tool_sequence(messages, None) is False
assert len(messages) == 1
def test_empty_messages_is_noop():
messages = []
assert close_interrupted_tool_sequence(messages, "x") is False
assert messages == []