mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(agent): keep image tool results from poisoning text-only sessions
This commit is contained in:
parent
bc42e62b17
commit
a28add199d
2 changed files with 110 additions and 12 deletions
61
run_agent.py
61
run_agent.py
|
|
@ -9324,6 +9324,46 @@ class AIAgent:
|
||||||
)
|
)
|
||||||
return transformed
|
return transformed
|
||||||
|
|
||||||
|
def _tool_result_content_for_active_model(self, tool_name: str, result: Any) -> Any:
    """Pick the tool-message content that the active model can accept.

    Results that are not multimodal are handed back untouched. A multimodal
    result is unwrapped to its ``content`` value (an empty list when that
    value is missing or falsy) whenever it carries no image parts or the
    active model supports vision.

    Otherwise the image parts are replaced by the value returned from
    ``_multimodal_text_summary`` so that image content is never placed in a
    tool message for a text-only model: ``computer_use`` gets a JSON error
    string that embeds the summary, while every other tool gets the bare
    summary after a warning has been logged.
    """
    if not _is_multimodal_tool_result(result):
        return result

    parts = result.get("content") or []
    # The ``or`` short-circuit keeps the vision probe from running when the
    # content holds no image parts at all.
    if not self._content_has_image_parts(parts) or self._model_supports_vision():
        return parts

    text_fallback = _multimodal_text_summary(result)

    if tool_name != "computer_use":
        logger.warning(
            "Tool %s returned image content for non-vision model %s/%s; "
            "falling back to text summary",
            tool_name,
            self.provider,
            self.model,
        )
        return text_fallback

    # computer_use reports an explicit error (instead of only a summary) that
    # tells the model to switch to a vision-capable model or to browser tools.
    error_text = (
        "computer_use returned screenshot/image content, but the active "
        "model/provider does not support image input. Switch to a "
        "vision-capable model for desktop computer use, or use browser "
        "tools for browser tasks."
    )
    return json.dumps({"error": error_text, "text_summary": text_fallback})
|
||||||
|
|
||||||
def _try_shrink_image_parts_in_messages(self, api_messages: list) -> bool:
|
def _try_shrink_image_parts_in_messages(self, api_messages: list) -> bool:
|
||||||
"""Re-encode all native image parts at a smaller size to recover from
|
"""Re-encode all native image parts at a smaller size to recover from
|
||||||
image-too-large errors (Anthropic 5 MB, unknown other providers).
|
image-too-large errors (Anthropic 5 MB, unknown other providers).
|
||||||
|
|
@ -11096,14 +11136,10 @@ class AIAgent:
|
||||||
# rather than a raw Python dict. The Anthropic adapter already
|
# rather than a raw Python dict. The Anthropic adapter already
|
||||||
# accepts content lists; vision-capable OpenAI-compatible servers
|
# accepts content lists; vision-capable OpenAI-compatible servers
|
||||||
# (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
|
# (mlx-vlm, GPT-4o, …) accept image_url in tool messages natively.
|
||||||
# Text-only servers that reject images are handled by the adaptive
|
# Text-only servers get a string-safe fallback here so a rejected
|
||||||
# _vision_supported recovery in the API retry loop.
|
# image tool result never poisons canonical session history.
|
||||||
# String results pass through unchanged.
|
# String results pass through unchanged.
|
||||||
_tool_content = (
|
_tool_content = self._tool_result_content_for_active_model(name, function_result)
|
||||||
function_result["content"]
|
|
||||||
if _is_multimodal_tool_result(function_result)
|
|
||||||
else function_result
|
|
||||||
)
|
|
||||||
tool_msg = {
|
tool_msg = {
|
||||||
"role": "tool",
|
"role": "tool",
|
||||||
"name": name,
|
"name": name,
|
||||||
|
|
@ -11518,11 +11554,7 @@ class AIAgent:
|
||||||
|
|
||||||
# Unwrap _multimodal dicts to an OpenAI-style content list
|
# Unwrap _multimodal dicts to an OpenAI-style content list
|
||||||
# (see parallel path for rationale). String results pass through.
|
# (see parallel path for rationale). String results pass through.
|
||||||
_tool_content = (
|
_tool_content = self._tool_result_content_for_active_model(function_name, function_result)
|
||||||
function_result["content"]
|
|
||||||
if _is_multimodal_tool_result(function_result)
|
|
||||||
else function_result
|
|
||||||
)
|
|
||||||
tool_msg = {
|
tool_msg = {
|
||||||
"role": "tool",
|
"role": "tool",
|
||||||
"name": function_name,
|
"name": function_name,
|
||||||
|
|
@ -13535,6 +13567,11 @@ class AIAgent:
|
||||||
# we don't false-trip on other URL validation
|
# we don't false-trip on other URL validation
|
||||||
# errors. (issue #23570)
|
# errors. (issue #23570)
|
||||||
"image_url'. expected",
|
"image_url'. expected",
|
||||||
|
# DeepSeek's OpenAI-compatible API reports text-only
|
||||||
|
# request-body variants as:
|
||||||
|
# "unknown variant `image_url`, expected `text`".
|
||||||
|
"unknown variant `image_url`, expected `text`",
|
||||||
|
"unknown variant image_url, expected text",
|
||||||
)
|
)
|
||||||
_err_lower = _err_body.lower()
|
_err_lower = _err_body.lower()
|
||||||
_looks_like_image_rejection = any(
|
_looks_like_image_rejection = any(
|
||||||
|
|
|
||||||
|
|
@ -591,6 +591,67 @@ class TestRunAgentMultimodalHelpers:
|
||||||
for p in cleaned["content"]
|
for p in cleaned["content"]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_computer_use_image_result_becomes_error_for_text_only_model(self):
    """A computer_use screenshot becomes a JSON error string on a non-vision model."""
    from run_agent import AIAgent

    screenshot_result = {
        "_multimodal": True,
        "content": [
            {"type": "text", "text": "screen captured"},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,x"}},
        ],
        "text_summary": "screen captured",
    }

    # Build a bare instance so AIAgent.__init__ is never executed.
    text_only_agent = object.__new__(AIAgent)
    text_only_agent.provider = "deepseek"
    text_only_agent.model = "deepseek-v4-pro"

    with patch.object(text_only_agent, "_model_supports_vision", return_value=False):
        raw_content = text_only_agent._tool_result_content_for_active_model(
            "computer_use", screenshot_result
        )

    payload = json.loads(raw_content)
    assert "computer_use returned screenshot/image content" in payload["error"]
    assert payload["text_summary"] == "screen captured"
    # The serialized string itself must not mention any image part.
    assert "image_url" not in raw_content
|
||||||
|
|
||||||
|
def test_computer_use_image_result_preserved_for_vision_model(self):
    """With vision support the original content list is returned as-is."""
    from run_agent import AIAgent

    image_part = {"type": "image_url", "image_url": {"url": "data:image/png;base64,x"}}
    screenshot_result = {
        "_multimodal": True,
        "content": [
            {"type": "text", "text": "screen captured"},
            image_part,
        ],
    }

    # Build a bare instance so AIAgent.__init__ is never executed.
    vision_agent = object.__new__(AIAgent)

    with patch.object(vision_agent, "_model_supports_vision", return_value=True):
        returned = vision_agent._tool_result_content_for_active_model(
            "computer_use", screenshot_result
        )

    # Identity check: the very same list object must come back.
    assert returned is screenshot_result["content"]
    assert any(part.get("type") == "image_url" for part in returned)
|
||||||
|
|
||||||
|
def test_other_multimodal_tool_uses_text_summary_for_text_only_model(self):
    """A tool other than computer_use yields the plain text summary on a non-vision model."""
    from run_agent import AIAgent

    analysis_result = {
        "_multimodal": True,
        "content": [
            {"type": "text", "text": "analysis text"},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,x"}},
        ],
        "text_summary": "analysis summary",
    }

    # Build a bare instance so AIAgent.__init__ is never executed; provider and
    # model are read when the fallback warning is logged.
    text_only_agent = object.__new__(AIAgent)
    text_only_agent.provider = "custom"
    text_only_agent.model = "text-only"

    with patch.object(text_only_agent, "_model_supports_vision", return_value=False):
        returned = text_only_agent._tool_result_content_for_active_model(
            "vision_analyze", analysis_result
        )

    assert returned == "analysis summary"
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Universality: does the schema work without Anthropic?
|
# Universality: does the schema work without Anthropic?
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue