mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(codex): detect leaked tool-call text in assistant content (#15347)
gpt-5.x on the Codex Responses API sometimes degenerates and emits
Harmony-style `to=functions.<name> {json}` serialization as plain
assistant-message text instead of a structured `function_call` item.
The intent never makes it into `response.output` as a function_call,
so `tool_calls` is empty and `_normalize_codex_response()` returns
the leaked text as the final content. Downstream (e.g. delegate_task),
this surfaces as a confident-looking summary with `tool_trace: []`
because no tools actually ran — the Taiwan-embassy-email bug report.
Detect the pattern, scrub the content, and return finish_reason=
'incomplete' so the existing Codex-incomplete continuation path
(run_agent.py:11331, 3 retries) gets a chance to re-elicit a proper
function_call item. Encrypted reasoning items are preserved so the
model keeps its chain-of-thought on the retry.
Regression tests: leaked text triggers incomplete, real tool calls
alongside leak-looking text are preserved, clean responses pass
through unchanged.
Reported on Discord (gpt-5.4 / openai-codex).
This commit is contained in:
parent
6a957a74bc
commit
4093ee9c62
2 changed files with 157 additions and 0 deletions
|
|
@ -23,6 +23,23 @@ from agent.prompt_builder import DEFAULT_AGENT_IDENTITY
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Matches Codex/Harmony tool-call serialization that occasionally leaks into
# assistant-message content when the model fails to emit a structured
# ``function_call`` item. Accepts the common forms:
#
#     to=functions.exec_command
#     assistant to=functions.exec_command
#     <|channel|>commentary to=functions.exec_command
#
# ``to=functions.<name>`` is the stable marker — the optional ``assistant`` or
# Harmony channel prefix varies by degeneration mode. Case-insensitive to
# cover lowercase/uppercase ``assistant`` variants.
#
# The leading ``(?:^|[\s>|])`` guard requires the marker to appear at the
# start of the text or after whitespace / ``>`` / ``|`` (the tail of a
# Harmony ``<|channel|>`` tag), so the pattern does not fire on the marker
# embedded inside a longer token. The name part accepts dotted tool names
# (``[\w.]*`` after the leading ``[A-Za-z_]``).
_TOOL_CALL_LEAK_PATTERN = re.compile(
    r"(?:^|[\s>|])to=functions\.[A-Za-z_][\w.]*",
    re.IGNORECASE,
)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Multimodal content helpers
|
# Multimodal content helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -787,6 +804,37 @@ def _normalize_codex_response(response: Any) -> tuple[Any, str]:
|
||||||
if isinstance(out_text, str):
|
if isinstance(out_text, str):
|
||||||
final_text = out_text.strip()
|
final_text = out_text.strip()
|
||||||
|
|
||||||
|
# ── Tool-call leak recovery ──────────────────────────────────
|
||||||
|
# gpt-5.x on the Codex Responses API sometimes degenerates and emits
|
||||||
|
# what should be a structured `function_call` item as plain assistant
|
||||||
|
# text using the Harmony/Codex serialization (``to=functions.foo
|
||||||
|
# {json}`` or ``assistant to=functions.foo {json}``). The model
|
||||||
|
# intended to call a tool, but the intent never made it into
|
||||||
|
# ``response.output`` as a ``function_call`` item, so ``tool_calls``
|
||||||
|
# is empty here. If we pass this through, the parent sees a
|
||||||
|
# confident-looking summary with no audit trail (empty ``tool_trace``)
|
||||||
|
# and no tools actually ran — the Taiwan-embassy-email incident.
|
||||||
|
#
|
||||||
|
# Detection: leaked tokens always contain ``to=functions.<name>`` and
|
||||||
|
# the assistant message has no real tool calls. Treat it as incomplete
|
||||||
|
# so the existing Codex-incomplete continuation path (3 retries,
|
||||||
|
# handled in run_agent.py) gets a chance to re-elicit a proper
|
||||||
|
# ``function_call`` item. The existing loop already handles message
|
||||||
|
# append, dedup, and retry budget.
|
||||||
|
leaked_tool_call_text = False
|
||||||
|
if final_text and not tool_calls and _TOOL_CALL_LEAK_PATTERN.search(final_text):
|
||||||
|
leaked_tool_call_text = True
|
||||||
|
logger.warning(
|
||||||
|
"Codex response contains leaked tool-call text in assistant content "
|
||||||
|
"(no structured function_call items). Treating as incomplete so the "
|
||||||
|
"continuation path can re-elicit a proper tool call. Leaked snippet: %r",
|
||||||
|
final_text[:300],
|
||||||
|
)
|
||||||
|
# Clear the text so downstream code doesn't surface the garbage as
|
||||||
|
# a summary. The encrypted reasoning items (if any) are preserved
|
||||||
|
# so the model keeps its chain-of-thought on the retry.
|
||||||
|
final_text = ""
|
||||||
|
|
||||||
assistant_message = SimpleNamespace(
|
assistant_message = SimpleNamespace(
|
||||||
content=final_text,
|
content=final_text,
|
||||||
tool_calls=tool_calls,
|
tool_calls=tool_calls,
|
||||||
|
|
@ -798,6 +846,8 @@ def _normalize_codex_response(response: Any) -> tuple[Any, str]:
|
||||||
|
|
||||||
if tool_calls:
|
if tool_calls:
|
||||||
finish_reason = "tool_calls"
|
finish_reason = "tool_calls"
|
||||||
|
elif leaked_tool_call_text:
|
||||||
|
finish_reason = "incomplete"
|
||||||
elif has_incomplete_items or (saw_commentary_phase and not saw_final_answer_phase):
|
elif has_incomplete_items or (saw_commentary_phase and not saw_final_answer_phase):
|
||||||
finish_reason = "incomplete"
|
finish_reason = "incomplete"
|
||||||
elif reasoning_items_raw and not final_text:
|
elif reasoning_items_raw and not final_text:
|
||||||
|
|
|
||||||
|
|
@ -943,6 +943,113 @@ def test_normalize_codex_response_marks_commentary_only_message_as_incomplete(mo
|
||||||
assert "inspect the repository" in (assistant_message.content or "")
|
assert "inspect the repository" in (assistant_message.content or "")
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_codex_response_detects_leaked_tool_call_text(monkeypatch):
    """Leaked Harmony ``to=functions.*`` text with no structured function_call
    item must be normalized to finish_reason='incomplete' with scrubbed content
    so the continuation path can re-elicit a proper tool call.

    Reproduces the Taiwan-embassy-email Discord report: the child agent emitted
    a confident-looking summary while ``tool_trace`` stayed empty because no
    tool actually ran, leaving the parent nothing to audit.
    """
    _build_agent(monkeypatch)  # installs the patched agent environment
    from agent.codex_responses_adapter import _normalize_codex_response

    garbled_text = (
        "I'll check the official page directly.\n"
        "to=functions.exec_command {\"cmd\": \"curl https://example.test\"}\n"
        "assistant to=functions.exec_command {\"stdout\": \"mailto:foo@example.test\"}\n"
        "Extracted: foo@example.test"
    )
    message_item = SimpleNamespace(
        type="message",
        status="completed",
        content=[SimpleNamespace(type="output_text", text=garbled_text)],
    )
    response = SimpleNamespace(
        output=[message_item],
        usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
        status="completed",
        model="gpt-5.4",
    )

    assistant_message, finish_reason = _normalize_codex_response(response)

    # The leak is treated as an incomplete turn: content is scrubbed so the
    # parent never surfaces the garbage as a summary, and tool_calls stays
    # empty because no structured function_call item ever existed.
    assert finish_reason == "incomplete"
    assert (assistant_message.content or "") == ""
    assert assistant_message.tool_calls == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_codex_response_ignores_tool_call_text_when_real_tool_call_present(monkeypatch):
    """A structured function_call item wins over leak-looking text.

    When the model emits BOTH a real function_call AND prose that merely
    mentions ``to=functions.*`` (unlikely but possible), the structured call
    is trusted and the accompanying content is left intact — nothing is
    scrubbed alongside a genuine tool use.
    """
    _build_agent(monkeypatch)  # installs the patched agent environment
    from agent.codex_responses_adapter import _normalize_codex_response

    text_item = SimpleNamespace(
        type="message",
        status="completed",
        content=[SimpleNamespace(
            type="output_text",
            text="Running the command via to=functions.exec_command now.",
        )],
    )
    call_item = SimpleNamespace(
        type="function_call",
        id="fc_1",
        call_id="call_1",
        name="terminal",
        arguments="{}",
    )
    response = SimpleNamespace(
        output=[text_item, call_item],
        usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
        status="completed",
        model="gpt-5.4",
    )

    assistant_message, finish_reason = _normalize_codex_response(response)

    assert finish_reason == "tool_calls"
    assert assistant_message.tool_calls  # the real structured call is preserved
    assert "Running the command" in (assistant_message.content or "")
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_codex_response_no_leak_passes_through(monkeypatch):
    """Sanity check: assistant content without the leak marker is returned
    verbatim with finish_reason='stop' and no tool calls."""
    _build_agent(monkeypatch)  # installs the patched agent environment
    from agent.codex_responses_adapter import _normalize_codex_response

    clean_item = SimpleNamespace(
        type="message",
        status="completed",
        content=[SimpleNamespace(
            type="output_text",
            text="Here is the answer with no leak.",
        )],
    )
    response = SimpleNamespace(
        output=[clean_item],
        usage=SimpleNamespace(input_tokens=4, output_tokens=2, total_tokens=6),
        status="completed",
        model="gpt-5.4",
    )

    assistant_message, finish_reason = _normalize_codex_response(response)

    assert finish_reason == "stop"
    assert assistant_message.content == "Here is the answer with no leak."
    assert assistant_message.tool_calls == []
|
||||||
|
|
||||||
|
|
||||||
def test_interim_commentary_is_not_marked_already_streamed_without_callbacks(monkeypatch):
|
def test_interim_commentary_is_not_marked_already_streamed_without_callbacks(monkeypatch):
|
||||||
agent = _build_agent(monkeypatch)
|
agent = _build_agent(monkeypatch)
|
||||||
observed = {}
|
observed = {}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue