diff --git a/run_agent.py b/run_agent.py
index e519ea85e3..91ab7d5099 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -864,6 +864,51 @@ def _sanitize_tools_non_ascii(tools: list) -> bool:
     return _sanitize_structure_non_ascii(tools)
 
 
+def _strip_images_from_messages(messages: list) -> bool:
+    """Remove image content parts from all messages in-place.
+
+    Called when a server signals it does not support images (e.g.
+    "Only 'text' content type is supported."). Mutates messages so the
+    next API call sends text only.
+
+    Parts typed ``image_url``, ``image`` or ``input_image`` are removed
+    from list-valued ``content``; non-dict messages and non-list content
+    are left untouched. A message that held nothing but images is:
+
+    * dropped when its role is ``user`` (added for image delivery only,
+      e.g. the deferred injection messages);
+    * kept with a text placeholder for any other role, so a ``tool``
+      reply is never deleted while the assistant ``tool_calls`` entry
+      naming its ``tool_call_id`` is still in the history.
+
+    Returns True if any image parts were removed.
+    """
+    image_types = ("image_url", "image", "input_image")
+    placeholder = "[image content removed: server does not support images]"
+    found = False
+    surviving = []
+    for msg in messages:
+        content = msg.get("content") if isinstance(msg, dict) else None
+        if isinstance(content, list):
+            kept_parts = [
+                part for part in content
+                if not (isinstance(part, dict) and part.get("type") in image_types)
+            ]
+            if len(kept_parts) < len(content):
+                found = True
+                if kept_parts:
+                    msg["content"] = kept_parts
+                elif msg.get("role") == "user":
+                    # Image-only user message: nothing left to send.
+                    continue
+                else:
+                    msg["content"] = [{"type": "text", "text": placeholder}]
+        surviving.append(msg)
+    # Slice-assign so callers holding a reference see the pruned list.
+    messages[:] = surviving
+    return found
+
+
 def _sanitize_structure_non_ascii(payload: Any) -> bool:
     """Strip non-ASCII characters from nested dict/list payloads in-place."""
     found = False
@@ -10251,10 +10296,23 @@ class AIAgent:
                     else:
                         function_result += subdir_hints
 
+                # Unwrap _multimodal dicts to an OpenAI-style content list so any
+                # vision-capable provider receives [{type:text},{type:image_url}]
+                # rather than a raw Python dict. The Anthropic adapter already
+                # accepts content lists; vision-capable OpenAI-compatible servers
+                # (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
+                # Text-only servers that reject images are handled by the adaptive
+                # _vision_supported recovery in the API retry loop.
+                # String results pass through unchanged.
+ _tool_content = ( + function_result["content"] + if _is_multimodal_tool_result(function_result) + else function_result + ) tool_msg = { "role": "tool", "name": name, - "content": function_result, + "content": _tool_content, "tool_call_id": tc.id, } messages.append(tool_msg) @@ -10650,10 +10698,17 @@ class AIAgent: else: function_result += subdir_hints + # Unwrap _multimodal dicts to an OpenAI-style content list + # (see parallel path for rationale). String results pass through. + _tool_content = ( + function_result["content"] + if _is_multimodal_tool_result(function_result) + else function_result + ) tool_msg = { "role": "tool", "name": function_name, - "content": function_result, + "content": _tool_content, "tool_call_id": tool_call.id } messages.append(tool_msg) @@ -10702,7 +10757,6 @@ class AIAgent: self._apply_pending_steer_to_tool_results(messages, num_tools_seq) - def _handle_max_iterations(self, messages: list, api_call_count: int) -> str: """Request a summary when max iterations are reached. Returns the final response text.""" print(f"⚠️ Reached maximum iterations ({self.max_iterations}). Requesting summary...") @@ -10985,6 +11039,11 @@ class AIAgent: self._unicode_sanitization_passes = 0 self._tool_guardrails.reset_for_turn() self._tool_guardrail_halt_decision = None + # True until the server rejects an image_url content part with an error + # like "Only 'text' content type is supported." The API retry loop then + # sets it to False so image stripping is not attempted twice. The flag is + # re-armed here with the other per-turn resets, not kept for the session. + self._vision_supported = True # Pre-turn connection health check: detect and clean up dead TCP # connections left over from provider outages or dropped streams. 
@@ -12521,6 +12580,43 @@ class AIAgent: ) continue + # ── Image-rejection recovery ────────────────────────────── + # Some providers (mlx-lm, text-only endpoints) reject image + # content with e.g. "Only 'text' content type is supported." + # On the first hit: strip images from messages and api_messages, + # clear _vision_supported (so this branch runs once), and retry. + # NOTE(review): _imgs_removed reflects `messages` only. + _err_body = "" + try: + _err_body = str(getattr(api_error, "body", None) or + getattr(api_error, "message", None) or + str(api_error)) + except Exception: + pass + _IMAGE_REJECTION_PHRASES = ( + "only 'text' content type is supported", + "only text content type is supported", + "image_url is not supported", + "multimodal is not supported", + "vision is not supported", + "does not support images", + ) + if ( + getattr(self, "_vision_supported", True) + and any(p in _err_body.lower() for p in _IMAGE_REJECTION_PHRASES) + ): + self._vision_supported = False + _imgs_removed = _strip_images_from_messages(messages) + if isinstance(api_messages, list): + _strip_images_from_messages(api_messages) + self._vprint( + f"{self.log_prefix}⚠️ Server rejected image content — " + f"switching to text-only mode for this session" + + (". Stripped images from history and retrying." if _imgs_removed else "."), + force=True, + ) + continue + status_code = getattr(api_error, "status_code", None) error_context = self._extract_api_error_context(api_error)