From d6785dc4d40cdd37d2ea1e28d5f012572b3cf17e Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sun, 12 Apr 2026 15:38:11 -0700 Subject: [PATCH] fix: empty response recovery for reasoning models (mimo, qwen, GLM) (#8609) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes for the (empty) response bug affecting open reasoning models: 1. Allow retries after prefill exhaustion — models like mimo-v2-pro always populate reasoning fields via OpenRouter, so the old 'not _has_structured' guard on the retry path blocked retries for EVERY reasoning model after the 2 prefill attempts. Now: 1 original + 2 prefills + 3 retries = 6 total attempts before (empty). 2. Reset prefill/retry counters on tool-call recovery — the counters accumulated across the entire conversation, never resetting during tool-calling turns. A model cycling empty→prefill→tools→empty burned both prefill attempts and the third empty got zero recovery. Now counters reset when prefill succeeds with tool calls. 3. Strip think blocks before _truly_empty check — inline think-block content made the string non-empty, skipping both retry paths. Reported by users on Telegram with xiaomi/mimo-v2-pro and qwen3.5 models. Reproduced: qwen3.5-9b emits tool calls as XML in reasoning field instead of proper function calls, causing content=None + tool_calls=None + reasoning with embedded XML. Prefill recovery works but counter accumulation caused permanent (empty) in long sessions. --- run_agent.py | 40 +++++++++++++++++++++++-------- tests/run_agent/test_run_agent.py | 14 +++++------ 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/run_agent.py b/run_agent.py index 360ef05177..4c0d3be4b0 100644 --- a/run_agent.py +++ b/run_agent.py @@ -9736,12 +9736,25 @@ class AIAgent: # Pop thinking-only prefill message(s) before appending # (tool-call path — same rationale as the final-response path). 
+ _had_prefill = False while ( messages and isinstance(messages[-1], dict) and messages[-1].get("_thinking_prefill") ): messages.pop() + _had_prefill = True + + # Reset prefill counter when tool calls follow a prefill + # recovery. Without this, the counter accumulates across + # the whole conversation — a model that intermittently + # empties (empty → prefill → tools → empty → prefill → + # tools) burns both prefill attempts and the third empty + # gets zero recovery. Resetting here treats each tool- + # call success as a fresh start. + if _had_prefill: + self._thinking_prefill_retries = 0 + self._empty_content_retries = 0 messages.append(assistant_msg) self._emit_interim_assistant_message(assistant_msg) @@ -9917,16 +9930,23 @@ class AIAgent: self._save_session_log(messages) continue - # ── Empty response retry (no reasoning) ────── - # Model returned nothing — no content, no - # structured reasoning, no tool calls. Common - # with open models (transient provider issues, - # rate limits, sampling flukes). Retry up to 3 - # times before attempting fallback. Skip when - # content has inline tags (model chose - # to reason, just no visible text). - _truly_empty = not final_response.strip() - if _truly_empty and not _has_structured and self._empty_content_retries < 3: + # ── Empty response retry ────────────────────── + # Model returned nothing usable. Retry up to 3 + # times before attempting fallback. This covers + # both truly empty responses (no content, no + # reasoning) AND reasoning-only responses after + # prefill exhaustion — models like mimo-v2-pro + # always populate reasoning fields via OpenRouter, + # so the old `not _has_structured` guard blocked + # retries for every reasoning model after prefill. 
+ _truly_empty = not self._strip_think_blocks( + final_response + ).strip() + _prefill_exhausted = ( + _has_structured + and self._thinking_prefill_retries >= 2 + ) + if _truly_empty and (not _has_structured or _prefill_exhausted) and self._empty_content_retries < 3: self._empty_content_retries += 1 logger.warning( "Empty response (no content or reasoning) — " diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index e4ae10f20c..2112ddc3f0 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -1741,9 +1741,9 @@ class TestRunConversation: {"role": "assistant", "content": "old answer"}, ] - # 3 responses: original + 2 prefill continuations (structured reasoning triggers prefill) + # 6 responses: original + 2 prefill + 3 retries after prefill exhaustion with ( - patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp, empty_resp, empty_resp]), + patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp] * 6), patch.object(agent, "_compress_context") as mock_compress, patch.object(agent, "_persist_session"), patch.object(agent, "_save_trajectory"), @@ -1754,18 +1754,18 @@ class TestRunConversation: mock_compress.assert_not_called() # no compression triggered assert result["completed"] is True assert result["final_response"] == "(empty)" - assert result["api_calls"] == 3 # 1 original + 2 prefill continuations + assert result["api_calls"] == 6 # 1 original + 2 prefill + 3 retries def test_reasoning_only_response_prefill_then_empty(self, agent): - """Structured reasoning-only triggers prefill continuation (up to 2), then falls through to (empty).""" + """Structured reasoning-only triggers prefill (2), then retries (3), then (empty).""" self._setup_agent(agent) empty_resp = _mock_response( content=None, finish_reason="stop", reasoning_content="structured reasoning answer", ) - # 3 responses: original + 2 prefill continuations, all reasoning-only - 
agent.client.chat.completions.create.side_effect = [empty_resp, empty_resp, empty_resp] + # 6 responses: 1 original + 2 prefill + 3 retries after prefill exhaustion + agent.client.chat.completions.create.side_effect = [empty_resp] * 6 with ( patch.object(agent, "_persist_session"), patch.object(agent, "_save_trajectory"), @@ -1774,7 +1774,7 @@ class TestRunConversation: result = agent.run_conversation("answer me") assert result["completed"] is True assert result["final_response"] == "(empty)" - assert result["api_calls"] == 3 # 1 original + 2 prefill continuations + assert result["api_calls"] == 6 # 1 original + 2 prefill + 3 retries def test_reasoning_only_prefill_succeeds_on_continuation(self, agent): """When prefill continuation produces content, it becomes the final response."""