fix(codex): detect leaked tool-call text in assistant content (#15347)

gpt-5.x on the Codex Responses API sometimes degenerates and emits
Harmony-style `to=functions.<name> {json}` serialization as plain
assistant-message text instead of a structured `function_call` item.
The intent never makes it into `response.output` as a function_call,
so `tool_calls` is empty and `_normalize_codex_response()` returns
the leaked text as the final content. Downstream (e.g. delegate_task),
this surfaces as a confident-looking summary with `tool_trace: []`
because no tools actually ran — the Taiwan-embassy-email bug report.

Detect the pattern, scrub the content, and return finish_reason=
'incomplete' so the existing Codex-incomplete continuation path
(run_agent.py:11331, 3 retries) gets a chance to re-elicit a proper
function_call item. Encrypted reasoning items are preserved so the
model keeps its chain-of-thought on the retry.

Regression tests: leaked text triggers incomplete, real tool calls
alongside leak-looking text are preserved, clean responses pass
through unchanged.

Reported on Discord (gpt-5.4 / openai-codex).
This commit is contained in:
Teknium 2026-04-24 14:39:59 -07:00 committed by GitHub
parent 6a957a74bc
commit 4093ee9c62
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 157 additions and 0 deletions

View file

@@ -23,6 +23,23 @@ from agent.prompt_builder import DEFAULT_AGENT_IDENTITY
logger = logging.getLogger(__name__)
# Matches Codex/Harmony tool-call serialization that occasionally leaks into
# assistant-message content when the model fails to emit a structured
# ``function_call`` item. Accepts the common forms:
#
# to=functions.exec_command
# assistant to=functions.exec_command
# <|channel|>commentary to=functions.exec_command
#
# ``to=functions.<name>`` is the stable marker — the optional ``assistant`` or
# Harmony channel prefix varies by degeneration mode. Case-insensitive to
# cover lowercase/uppercase ``assistant`` variants.
_TOOL_CALL_LEAK_PATTERN = re.compile(
r"(?:^|[\s>|])to=functions\.[A-Za-z_][\w.]*",
re.IGNORECASE,
)
# ---------------------------------------------------------------------------
# Multimodal content helpers
# ---------------------------------------------------------------------------
@@ -787,6 +804,37 @@ def _normalize_codex_response(response: Any) -> tuple[Any, str]:
# NOTE(review): out_text presumably comes from the Responses API's
# ``output_text`` convenience field — confirm upstream of this hunk.
if isinstance(out_text, str):
final_text = out_text.strip()
# ── Tool-call leak recovery ──────────────────────────────────
# gpt-5.x on the Codex Responses API sometimes degenerates and emits
# what should be a structured `function_call` item as plain assistant
# text using the Harmony/Codex serialization (``to=functions.foo
# {json}`` or ``assistant to=functions.foo {json}``). The model
# intended to call a tool, but the intent never made it into
# ``response.output`` as a ``function_call`` item, so ``tool_calls``
# is empty here. If we pass this through, the parent sees a
# confident-looking summary with no audit trail (empty ``tool_trace``)
# and no tools actually ran — the Taiwan-embassy-email incident.
#
# Detection: leaked tokens always contain ``to=functions.<name>`` and
# the assistant message has no real tool calls. Treat it as incomplete
# so the existing Codex-incomplete continuation path (3 retries,
# handled in run_agent.py) gets a chance to re-elicit a proper
# ``function_call`` item. The existing loop already handles message
# append, dedup, and retry budget.
leaked_tool_call_text = False
# Guard on ``not tool_calls``: if any structured call was emitted, text that
# merely *looks* like a leak is left alone (see regression tests).
if final_text and not tool_calls and _TOOL_CALL_LEAK_PATTERN.search(final_text):
leaked_tool_call_text = True
logger.warning(
"Codex response contains leaked tool-call text in assistant content "
"(no structured function_call items). Treating as incomplete so the "
"continuation path can re-elicit a proper tool call. Leaked snippet: %r",
final_text[:300],
)
# Clear the text so downstream code doesn't surface the garbage as
# a summary. The encrypted reasoning items (if any) are preserved
# so the model keeps its chain-of-thought on the retry.
final_text = ""
# NOTE(review): SimpleNamespace mimics a chat-completion-style message
# object (``content``/``tool_calls`` attributes); the constructor call is
# truncated here — remaining keyword arguments lie outside this hunk.
assistant_message = SimpleNamespace(
content=final_text,
tool_calls=tool_calls,
@@ -798,6 +846,8 @@ def _normalize_codex_response(response: Any) -> tuple[Any, str]:
# finish_reason priority: real structured tool calls win outright; the
# leak flag set above forces 'incomplete' so the continuation/retry path
# re-elicits a proper function_call; remaining branches fall through to
# the pre-existing incomplete/reasoning heuristics.
if tool_calls:
finish_reason = "tool_calls"
elif leaked_tool_call_text:
finish_reason = "incomplete"
elif has_incomplete_items or (saw_commentary_phase and not saw_final_answer_phase):
finish_reason = "incomplete"
elif reasoning_items_raw and not final_text: