diff --git a/run_agent.py b/run_agent.py
index b60f6c43ce6..906f706d08a 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -9324,6 +9324,67 @@ class AIAgent:
         )
         return transformed
 
+    def _tool_result_content_for_active_model(self, tool_name: str, result: Any) -> Any:
+        """Return the tool message content that is safe for the active model.
+
+        Multimodal tool results normally unwrap to OpenAI-style content parts so
+        vision-capable models can inspect screenshots. Text-only providers must
+        not receive those image parts, because a rejected tool result becomes
+        part of the canonical history and can make the next user turn fail before
+        the agent has a chance to recover.
+
+        Args:
+            tool_name: Name of the tool that produced ``result``.
+            result: Raw tool result. Anything that is not a multimodal result
+                is returned untouched.
+
+        Returns:
+            ``result`` itself when it is not multimodal; the content list when
+            it carries no image parts or the active model supports vision;
+            otherwise a string: a JSON error payload for ``computer_use``, or
+            the text summary for every other tool.
+        """
+        if not _is_multimodal_tool_result(result):
+            return result
+
+        content = result.get("content") or []
+        if not self._content_has_image_parts(content):
+            return content
+
+        if self._model_supports_vision():
+            return content
+
+        summary = _multimodal_text_summary(result)
+        if tool_name == "computer_use":
+            # computer_use gets an explicit, actionable error instead of the
+            # bare summary; log it so this downgrade is traceable like the
+            # text-summary fallback below.
+            logger.warning(
+                "Tool %s returned image content for non-vision model %s/%s; "
+                "returning an explanatory error instead",
+                tool_name,
+                self.provider,
+                self.model,
+            )
+            return json.dumps({
+                "error": (
+                    "computer_use returned screenshot/image content, but the active "
+                    "model/provider does not support image input. Switch to a "
+                    "vision-capable model for desktop computer use, or use browser "
+                    "tools for browser tasks."
+                ),
+                "text_summary": summary,
+            })
+
+        logger.warning(
+            "Tool %s returned image content for non-vision model %s/%s; "
+            "falling back to text summary",
+            tool_name,
+            self.provider,
+            self.model,
+        )
+        return summary
+
     def _try_shrink_image_parts_in_messages(self, api_messages: list) -> bool:
         """Re-encode all native image parts at a smaller size to recover from
         image-too-large errors (Anthropic 5 MB, unknown other providers).
@@ -11096,14 +11157,10 @@ class AIAgent:
             # rather than a raw Python dict. The Anthropic adapter already
             # accepts content lists; vision-capable OpenAI-compatible servers
             # (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
-            # Text-only servers that reject images are handled by the adaptive
-            # _vision_supported recovery in the API retry loop.
+            # Text-only servers get a string-safe fallback here so a rejected
+            # image tool result never poisons canonical session history.
             # String results pass through unchanged.
-            _tool_content = (
-                function_result["content"]
-                if _is_multimodal_tool_result(function_result)
-                else function_result
-            )
+            _tool_content = self._tool_result_content_for_active_model(name, function_result)
             tool_msg = {
                 "role": "tool",
                 "name": name,
@@ -11518,11 +11575,7 @@ class AIAgent:
 
             # Unwrap _multimodal dicts to an OpenAI-style content list
             # (see parallel path for rationale). String results pass through.
-            _tool_content = (
-                function_result["content"]
-                if _is_multimodal_tool_result(function_result)
-                else function_result
-            )
+            _tool_content = self._tool_result_content_for_active_model(function_name, function_result)
             tool_msg = {
                 "role": "tool",
                 "name": function_name,
@@ -13535,6 +13588,11 @@ class AIAgent:
                         # we don't false-trip on other URL validation
                         # errors. (issue #23570)
                         "image_url'. expected",
+                        # DeepSeek's OpenAI-compatible API reports text-only
+                        # request-body variants as:
+                        # "unknown variant `image_url`, expected `text`".
+                        "unknown variant `image_url`, expected `text`",
+                        "unknown variant image_url, expected text",
                     )
                     _err_lower = _err_body.lower()
                     _looks_like_image_rejection = any(
diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py
index 58700dcaaf2..5b035950348 100644
--- a/tests/tools/test_computer_use.py
+++ b/tests/tools/test_computer_use.py
@@ -591,6 +591,78 @@ class TestRunAgentMultimodalHelpers:
             for p in cleaned["content"]
         )
 
+    def test_computer_use_image_result_becomes_error_for_text_only_model(self):
+        from run_agent import AIAgent
+
+        agent = object.__new__(AIAgent)
+        agent.provider = "deepseek"
+        agent.model = "deepseek-v4-pro"
+        result = {
+            "_multimodal": True,
+            "content": [
+                {"type": "text", "text": "screen captured"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,x"}},
+            ],
+            "text_summary": "screen captured",
+        }
+
+        with patch.object(agent, "_model_supports_vision", return_value=False), \
+                patch("run_agent.logger") as mock_logger:
+            content = agent._tool_result_content_for_active_model("computer_use", result)
+
+        parsed = json.loads(content)
+        assert "computer_use returned screenshot/image content" in parsed["error"]
+        assert parsed["text_summary"] == "screen captured"
+        assert "image_url" not in content
+        assert mock_logger.warning.called
+
+    def test_computer_use_image_result_preserved_for_vision_model(self):
+        from run_agent import AIAgent
+
+        agent = object.__new__(AIAgent)
+        result = {
+            "_multimodal": True,
+            "content": [
+                {"type": "text", "text": "screen captured"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,x"}},
+            ],
+        }
+
+        with patch.object(agent, "_model_supports_vision", return_value=True):
+            content = agent._tool_result_content_for_active_model("computer_use", result)
+
+        assert content is result["content"]
+        assert any(part.get("type") == "image_url" for part in content)
+
+    def test_other_multimodal_tool_uses_text_summary_for_text_only_model(self):
+        from run_agent import AIAgent
+
+        agent = object.__new__(AIAgent)
+        agent.provider = "custom"
+        agent.model = "text-only"
+        result = {
+            "_multimodal": True,
+            "content": [
+                {"type": "text", "text": "analysis text"},
+                {"type": "image_url", "image_url": {"url": "data:image/png;base64,x"}},
+            ],
+            "text_summary": "analysis summary",
+        }
+
+        with patch.object(agent, "_model_supports_vision", return_value=False):
+            content = agent._tool_result_content_for_active_model("vision_analyze", result)
+
+        assert content == "analysis summary"
+
+    def test_plain_string_result_passes_through_unchanged(self):
+        from run_agent import AIAgent
+
+        agent = object.__new__(AIAgent)
+
+        content = agent._tool_result_content_for_active_model("terminal", "plain output")
+
+        assert content == "plain output"
+
 
 # ---------------------------------------------------------------------------
 # Universality: does the schema work without Anthropic?