fix(agent): keep image tool results from poisoning text-only sessions

This commit is contained in:
helix4u 2026-05-14 13:22:08 -06:00 committed by Teknium
parent bc42e62b17
commit a28add199d
2 changed files with 110 additions and 12 deletions

View file

@ -9324,6 +9324,46 @@ class AIAgent:
)
return transformed
def _tool_result_content_for_active_model(self, tool_name: str, result: Any) -> Any:
    """Pick the tool-message payload that the active model can accept.

    Results that ``_is_multimodal_tool_result`` does not recognise are
    returned untouched. Recognised multimodal results are unwrapped to their
    ``"content"`` value (an empty list when that value is missing or falsy),
    and that value is returned as-is when it carries no image parts or when
    the active model supports vision.

    When image parts are present but the model is text-only, the images are
    never forwarded: a tool result rejected by the provider would be stored
    in the canonical history and could make the following user turn fail
    before the agent gets a chance to recover. Instead:

    * for ``computer_use`` a JSON string is returned with an ``"error"``
      explanation and the ``"text_summary"`` of the result;
    * for every other tool a warning is logged and the text summary from
      ``_multimodal_text_summary`` is returned.

    Args:
        tool_name: Name of the tool that produced ``result``.
        result: Raw tool result, multimodal or not.

    Returns:
        The value to use as the tool message ``content``.
    """
    if not _is_multimodal_tool_result(result):
        return result

    parts = result.get("content") or []
    # Short-circuit order matters: vision support is only consulted when
    # image parts are actually present.
    images_are_acceptable = (
        not self._content_has_image_parts(parts) or self._model_supports_vision()
    )
    if images_are_acceptable:
        return parts

    text_fallback = _multimodal_text_summary(result)

    if tool_name != "computer_use":
        logger.warning(
            "Tool %s returned image content for non-vision model %s/%s; "
            "falling back to text summary",
            tool_name,
            self.provider,
            self.model,
        )
        return text_fallback

    # computer_use cannot work without screenshots, so report a structured
    # error to the model instead of a bare summary.
    error_text = (
        "computer_use returned screenshot/image content, but the active "
        "model/provider does not support image input. Switch to a "
        "vision-capable model for desktop computer use, or use browser "
        "tools for browser tasks."
    )
    return json.dumps({"error": error_text, "text_summary": text_fallback})
def _try_shrink_image_parts_in_messages(self, api_messages: list) -> bool:
"""Re-encode all native image parts at a smaller size to recover from
image-too-large errors (Anthropic 5 MB, unknown other providers).
@ -11096,14 +11136,10 @@ class AIAgent:
# rather than a raw Python dict. The Anthropic adapter already
# accepts content lists; vision-capable OpenAI-compatible servers
# (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
# Text-only servers that reject images are handled by the adaptive
# _vision_supported recovery in the API retry loop.
# Text-only servers get a string-safe fallback here so a rejected
# image tool result never poisons canonical session history.
# String results pass through unchanged.
_tool_content = (
function_result["content"]
if _is_multimodal_tool_result(function_result)
else function_result
)
_tool_content = self._tool_result_content_for_active_model(name, function_result)
tool_msg = {
"role": "tool",
"name": name,
@ -11518,11 +11554,7 @@ class AIAgent:
# Unwrap _multimodal dicts to an OpenAI-style content list, or to a
# string-safe text fallback when the active model cannot accept images
# (see parallel path for rationale). String results pass through.
_tool_content = (
function_result["content"]
if _is_multimodal_tool_result(function_result)
else function_result
)
_tool_content = self._tool_result_content_for_active_model(function_name, function_result)
tool_msg = {
"role": "tool",
"name": function_name,
@ -13535,6 +13567,11 @@ class AIAgent:
# we don't false-trip on other URL validation
# errors. (issue #23570)
"image_url'. expected",
# DeepSeek's OpenAI-compatible API reports text-only
# request-body variants as:
# "unknown variant `image_url`, expected `text`".
"unknown variant `image_url`, expected `text`",
"unknown variant image_url, expected text",
)
_err_lower = _err_body.lower()
_looks_like_image_rejection = any(