From c6a8bcb542e564fc61fa0d8f13a251bf4c659091 Mon Sep 17 00:00:00 2001 From: Evan Date: Thu, 9 Apr 2026 01:27:36 +0800 Subject: [PATCH] test(codex): cover empty final tool stream regression --- .../test_run_agent_codex_responses.py | 113 +++++++++++++++++- 1 file changed, 111 insertions(+), 2 deletions(-) diff --git a/tests/run_agent/test_run_agent_codex_responses.py b/tests/run_agent/test_run_agent_codex_responses.py index ea703ffbb..66bf40723 100644 --- a/tests/run_agent/test_run_agent_codex_responses.py +++ b/tests/run_agent/test_run_agent_codex_responses.py @@ -148,7 +148,8 @@ def _codex_ack_message_response(text: str): class _FakeResponsesStream: - def __init__(self, *, final_response=None, final_error=None): + def __init__(self, *, events=None, final_response=None, final_error=None): + self._events = list(events or []) self._final_response = final_response self._final_error = final_error @@ -159,7 +160,7 @@ class _FakeResponsesStream: return False def __iter__(self): - return iter(()) + return iter(self._events) def get_final_response(self): if self._final_error is not None: @@ -374,6 +375,53 @@ def test_run_codex_stream_fallback_parses_create_stream_events(monkeypatch): assert response.output[0].content[0].text == "streamed create ok" +def test_run_codex_stream_backfills_tool_items_from_output_item_done(monkeypatch): + agent = _build_agent(monkeypatch) + tool_item_done = SimpleNamespace( + type="function_call", + status="completed", + call_id="call_1", + id="fc_1", + name="terminal", + arguments='{"command":"pwd"}', + ) + + stream = _FakeResponsesStream( + events=[ + SimpleNamespace(type="response.created"), + SimpleNamespace(type="response.in_progress"), + SimpleNamespace( + type="response.output_item.done", + output_index=0, + item=tool_item_done, + ), + SimpleNamespace(type="response.completed"), + ], + final_response=SimpleNamespace( + output=[], + output_text="", + status="completed", + model="gpt-5.4", + ), + ) + + agent.client = SimpleNamespace( + responses=SimpleNamespace( + stream=lambda **kwargs: stream, + create=lambda **kwargs: _codex_message_response("fallback"), + ) + ) + + response = agent._run_codex_stream(_codex_request_kwargs()) + + assistant_message, finish_reason = agent._normalize_codex_response(response) + assert finish_reason == "tool_calls" + assert len(assistant_message.tool_calls) == 1 + assert assistant_message.tool_calls[0].id == "call_1" + assert assistant_message.tool_calls[0].function.name == "terminal" + assert assistant_message.tool_calls[0].function.arguments == '{"command":"pwd"}' + + def test_run_conversation_codex_plain_text(monkeypatch): agent = _build_agent(monkeypatch) monkeypatch.setattr(agent, "_interruptible_api_call", lambda api_kwargs: _codex_message_response("OK")) @@ -386,6 +434,67 @@ def test_run_conversation_codex_plain_text(monkeypatch): assert result["messages"][-1]["content"] == "OK" +def test_run_conversation_codex_tool_stream_empty_final_output(monkeypatch): + agent = _build_agent(monkeypatch) + + first_stream = _FakeResponsesStream( + events=[ + SimpleNamespace(type="response.created"), + SimpleNamespace(type="response.in_progress"), + SimpleNamespace( + type="response.output_item.done", + output_index=0, + item=SimpleNamespace( + type="function_call", + status="completed", + call_id="call_1", + id="fc_1", + name="terminal", + arguments="{}", + ), + ), + SimpleNamespace(type="response.completed"), + ], + final_response=SimpleNamespace( + output=[], + output_text="", + status="completed", + model="gpt-5.4", + ), + ) + second_stream = _FakeResponsesStream(final_response=_codex_message_response("done")) + + streams = [first_stream, second_stream] + fake_client = SimpleNamespace( + responses=SimpleNamespace( + stream=lambda **kwargs: streams.pop(0), + create=lambda **kwargs: _codex_message_response("fallback"), + ) + ) + monkeypatch.setattr( + agent, + "_interruptible_api_call", + lambda api_kwargs: agent._run_codex_stream(api_kwargs, client=fake_client), + ) + + def _fake_execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count=0): + for call in assistant_message.tool_calls: + messages.append( + {"role": "tool", "tool_call_id": call.id, "content": '{"ok":true}'} + ) + + monkeypatch.setattr(agent, "_execute_tool_calls", _fake_execute_tool_calls) + + result = agent.run_conversation("run the terminal tool then finish") + + assert result["completed"] is True + assert result["final_response"] == "done" + assert any( + msg.get("role") == "tool" and msg.get("tool_call_id") == "call_1" + for msg in result["messages"] + ) + + def test_run_conversation_codex_empty_output_with_output_text(monkeypatch): """Regression: empty response.output + valid output_text should succeed, not trigger retry/fallback. The validation stage must defer to