mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-09 03:11:58 +00:00
fix(computer-use): harden image-rejection fallback + AUTHOR_MAP
Follow-up to #15328's vision-unsupported retry branch in run_agent.py. _strip_images_from_messages() previously deleted any message whose content was entirely images. That's fine for synthetic user messages injected for attachment delivery, but it breaks providers for tool-role messages — the paired tool_call_id on the preceding assistant message ends up unmatched, which OpenAI-compatible APIs reject with HTTP 400. Fix: tool-role messages whose content becomes empty are replaced with a plaintext placeholder that preserves the tool_call_id linkage. Only non-tool messages are dropped. Added 10 tests covering the role-alternation invariants + image-type coverage. Image-rejection detector: expanded phrase list (image content not supported / multimodal input / vision input / model does not support image) and gated on 4xx status so transient 5xx errors never get misinterpreted as 'server said no to images'. Detection is documented as best-effort English phrase matching. AUTHOR_MAP: mapped 3820588+ddupont808@users.noreply.github.com to ddupont808 so release notes attribute the salvage correctly.
This commit is contained in:
parent
2937f9bef6
commit
d0aad4b021
4 changed files with 288 additions and 5 deletions
48
run_agent.py
48
run_agent.py
|
|
@ -871,6 +871,15 @@ def _strip_images_from_messages(messages: list) -> bool:
|
|||
"Only 'text' content type is supported."). Mutates messages so the
|
||||
next API call sends text only.
|
||||
|
||||
Preserves message alternation invariants:
|
||||
* ``tool``-role messages whose content was entirely images are replaced
|
||||
with a plaintext placeholder, NOT deleted — deleting them would leave
|
||||
the paired ``tool_call_id`` on the prior assistant message unmatched,
|
||||
which providers reject with HTTP 400.
|
||||
* Non-tool messages whose content becomes empty are dropped. In
|
||||
practice this only hits synthetic image-only user messages appended
|
||||
for attachment delivery; real user turns always include text.
|
||||
|
||||
Returns True if any image parts were removed.
|
||||
"""
|
||||
found = False
|
||||
|
|
@ -890,9 +899,13 @@ def _strip_images_from_messages(messages: list) -> bool:
|
|||
if len(new_parts) < len(content):
|
||||
if new_parts:
|
||||
msg["content"] = new_parts
|
||||
elif msg.get("role") == "tool":
|
||||
# Preserve tool_call_id linkage — providers require every
|
||||
# assistant tool_call to have a matching tool response.
|
||||
msg["content"] = "[image content removed — server does not support images]"
|
||||
else:
|
||||
# Entire message was images — drop it (user messages added for
|
||||
# image delivery only, e.g. the deferred injection messages).
|
||||
# Synthetic image-only user/assistant message with no text;
|
||||
# safe to drop.
|
||||
to_delete.append(i)
|
||||
for i in reversed(to_delete):
|
||||
del messages[i]
|
||||
|
|
@ -12581,11 +12594,18 @@ class AIAgent:
|
|||
continue
|
||||
|
||||
# ── Image-rejection recovery ──────────────────────────────
|
||||
# Some providers (mlx-lm, text-only endpoints) reject any
|
||||
# message that contains image_url content with an error like
|
||||
# Some providers (mlx-lm, text-only endpoints, text-only
|
||||
# fallbacks on multimodal models) reject any message that
|
||||
# contains image_url content with a 4xx error like
|
||||
# "Only 'text' content type is supported." On first hit,
|
||||
# strip all images from the message list, mark the session
|
||||
# as vision-unsupported, and retry with text only.
|
||||
#
|
||||
# Detection is best-effort English phrase matching — a
|
||||
# locale-translated or heavily-reworded upstream error
|
||||
# will bypass this guard and fall through to the normal
|
||||
# error handler. Expand the phrase list when new
|
||||
# provider wordings are observed in the wild.
|
||||
_err_body = ""
|
||||
try:
|
||||
_err_body = str(getattr(api_error, "body", None) or
|
||||
|
|
@ -12593,17 +12613,35 @@ class AIAgent:
|
|||
str(api_error))
|
||||
except Exception:
|
||||
pass
|
||||
_err_status = getattr(api_error, "status_code", None)
|
||||
_IMAGE_REJECTION_PHRASES = (
|
||||
"only 'text' content type is supported",
|
||||
"only text content type is supported",
|
||||
"image_url is not supported",
|
||||
"image content is not supported",
|
||||
"multimodal is not supported",
|
||||
"multimodal content is not supported",
|
||||
"multimodal input is not supported",
|
||||
"vision is not supported",
|
||||
"vision input is not supported",
|
||||
"does not support images",
|
||||
"does not support image input",
|
||||
"does not support multimodal",
|
||||
"does not support vision",
|
||||
"model does not support image",
|
||||
)
|
||||
_err_lower = _err_body.lower()
|
||||
_looks_like_image_rejection = any(
|
||||
p in _err_lower for p in _IMAGE_REJECTION_PHRASES
|
||||
)
|
||||
# 4xx-only gate: never interpret 5xx/timeout as "server
|
||||
# said no to images" — those are transient and must
|
||||
# route to the normal retry path.
|
||||
_status_ok = _err_status is None or (400 <= int(_err_status) < 500)
|
||||
if (
|
||||
getattr(self, "_vision_supported", True)
|
||||
and any(p in _err_body.lower() for p in _IMAGE_REJECTION_PHRASES)
|
||||
and _looks_like_image_rejection
|
||||
and _status_ok
|
||||
):
|
||||
self._vision_supported = False
|
||||
_imgs_removed = _strip_images_from_messages(messages)
|
||||
|
|
|
|||
|
|
@ -272,6 +272,7 @@ AUTHOR_MAP = {
|
|||
"104278804+Sertug17@users.noreply.github.com": "Sertug17",
|
||||
"112503481+caentzminger@users.noreply.github.com": "caentzminger",
|
||||
"258577966+voidborne-d@users.noreply.github.com": "voidborne-d",
|
||||
"3820588+ddupont808@users.noreply.github.com": "ddupont808",
|
||||
"liusway405@gmail.com": "voidborne-d",
|
||||
"xydarcher@uestc.edu.cn": "Readon",
|
||||
"sir_even@icloud.com": "sirEven",
|
||||
|
|
|
|||
243
tests/run_agent/test_image_rejection_fallback.py
Normal file
243
tests/run_agent/test_image_rejection_fallback.py
Normal file
|
|
@ -0,0 +1,243 @@
|
|||
"""Tests for the image-rejection fallback in run_agent.
|
||||
|
||||
When a server rejects image content (e.g. text-only endpoints), the agent
|
||||
strips image parts from message history and retries text-only. These tests
|
||||
verify that stripping preserves the role-alternation invariants providers
|
||||
require, and that the phrase detector fires on the expected error bodies.
|
||||
"""
|
||||
|
||||
from run_agent import _strip_images_from_messages
|
||||
|
||||
|
||||
class TestStripImagesPreservesAlternation:
|
||||
"""_strip_images_from_messages must not break message role alternation."""
|
||||
|
||||
def test_noop_when_no_images(self):
|
||||
msgs = [
|
||||
{"role": "user", "content": "hello"},
|
||||
{"role": "assistant", "content": "hi"},
|
||||
]
|
||||
changed = _strip_images_from_messages(msgs)
|
||||
assert changed is False
|
||||
assert msgs == [
|
||||
{"role": "user", "content": "hello"},
|
||||
{"role": "assistant", "content": "hi"},
|
||||
]
|
||||
|
||||
def test_string_content_untouched(self):
|
||||
"""String content passes through — only list content is inspected."""
|
||||
msgs = [{"role": "user", "content": "just text"}]
|
||||
changed = _strip_images_from_messages(msgs)
|
||||
assert changed is False
|
||||
assert msgs[0]["content"] == "just text"
|
||||
|
||||
def test_strips_image_url_part_preserves_text(self):
|
||||
msgs = [{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "describe"},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}},
|
||||
],
|
||||
}]
|
||||
changed = _strip_images_from_messages(msgs)
|
||||
assert changed is True
|
||||
assert msgs[0]["content"] == [{"type": "text", "text": "describe"}]
|
||||
|
||||
def test_strips_all_recognized_image_types(self):
|
||||
msgs = [{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "hi"},
|
||||
{"type": "image_url", "image_url": {}},
|
||||
{"type": "image", "source": {}},
|
||||
{"type": "input_image", "image_url": "http://x"},
|
||||
],
|
||||
}]
|
||||
changed = _strip_images_from_messages(msgs)
|
||||
assert changed is True
|
||||
assert msgs[0]["content"] == [{"type": "text", "text": "hi"}]
|
||||
|
||||
def test_tool_message_with_all_images_replaced_not_deleted(self):
|
||||
"""CRITICAL: tool messages must NEVER be deleted — their tool_call_id
|
||||
pairs with an assistant tool_call and providers reject unmatched IDs.
|
||||
"""
|
||||
msgs = [
|
||||
{"role": "user", "content": "take a screenshot"},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": None,
|
||||
"tool_calls": [{
|
||||
"id": "call_abc",
|
||||
"type": "function",
|
||||
"function": {"name": "computer_use", "arguments": "{}"},
|
||||
}],
|
||||
},
|
||||
{
|
||||
"role": "tool",
|
||||
"tool_call_id": "call_abc",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
|
||||
],
|
||||
},
|
||||
]
|
||||
changed = _strip_images_from_messages(msgs)
|
||||
assert changed is True
|
||||
# Length preserved — tool message NOT deleted
|
||||
assert len(msgs) == 3
|
||||
# tool_call_id still present
|
||||
assert msgs[2]["tool_call_id"] == "call_abc"
|
||||
# Content replaced with text placeholder (now a string, not a list)
|
||||
assert isinstance(msgs[2]["content"], str)
|
||||
assert "image content removed" in msgs[2]["content"].lower()
|
||||
|
||||
def test_tool_message_with_mixed_content_keeps_text_parts(self):
|
||||
msgs = [
|
||||
{"role": "user", "content": "screenshot plz"},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": None,
|
||||
"tool_calls": [{"id": "call_1", "type": "function", "function": {"name": "x", "arguments": "{}"}}],
|
||||
},
|
||||
{
|
||||
"role": "tool",
|
||||
"tool_call_id": "call_1",
|
||||
"content": [
|
||||
{"type": "text", "text": "Captured 1024x768"},
|
||||
{"type": "image_url", "image_url": {"url": "data:..."}},
|
||||
],
|
||||
},
|
||||
]
|
||||
changed = _strip_images_from_messages(msgs)
|
||||
assert changed is True
|
||||
assert len(msgs) == 3
|
||||
assert msgs[2]["content"] == [{"type": "text", "text": "Captured 1024x768"}]
|
||||
assert msgs[2]["tool_call_id"] == "call_1"
|
||||
|
||||
def test_image_only_user_message_dropped(self):
|
||||
"""Synthetic image-only user messages (gateway injection pattern) are
|
||||
safe to drop — no tool_call_id linkage to preserve."""
|
||||
msgs = [
|
||||
{"role": "user", "content": "what's in this?"},
|
||||
{"role": "assistant", "content": "I'll check."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [{"type": "image_url", "image_url": {"url": "data:..."}}],
|
||||
},
|
||||
]
|
||||
changed = _strip_images_from_messages(msgs)
|
||||
assert changed is True
|
||||
# Synthetic image-only user message dropped
|
||||
assert len(msgs) == 2
|
||||
assert msgs[-1]["role"] == "assistant"
|
||||
|
||||
def test_multiple_tool_messages_all_preserved(self):
|
||||
"""Parallel tool calls: each tool_call_id must retain a paired message."""
|
||||
msgs = [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": None,
|
||||
"tool_calls": [
|
||||
{"id": "c1", "type": "function", "function": {"name": "x", "arguments": "{}"}},
|
||||
{"id": "c2", "type": "function", "function": {"name": "x", "arguments": "{}"}},
|
||||
],
|
||||
},
|
||||
{
|
||||
"role": "tool",
|
||||
"tool_call_id": "c1",
|
||||
"content": [{"type": "image_url", "image_url": {}}],
|
||||
},
|
||||
{
|
||||
"role": "tool",
|
||||
"tool_call_id": "c2",
|
||||
"content": [{"type": "image_url", "image_url": {}}],
|
||||
},
|
||||
]
|
||||
changed = _strip_images_from_messages(msgs)
|
||||
assert changed is True
|
||||
tool_msgs = [m for m in msgs if m.get("role") == "tool"]
|
||||
assert len(tool_msgs) == 2
|
||||
assert {m["tool_call_id"] for m in tool_msgs} == {"c1", "c2"}
|
||||
|
||||
def test_returns_false_when_nothing_changed(self):
|
||||
msgs = [
|
||||
{"role": "user", "content": [{"type": "text", "text": "hi"}]},
|
||||
{"role": "assistant", "content": "hello"},
|
||||
]
|
||||
assert _strip_images_from_messages(msgs) is False
|
||||
|
||||
def test_handles_non_dict_entries_gracefully(self):
|
||||
msgs = [None, "not a dict", {"role": "user", "content": "ok"}]
|
||||
# Must not raise
|
||||
changed = _strip_images_from_messages(msgs)
|
||||
assert changed is False
|
||||
|
||||
|
||||
class TestImageRejectionPhraseIsolation:
|
||||
"""The image-rejection phrase list must NOT false-match on other
|
||||
image-related error categories (size-too-large, format errors, etc.)
|
||||
so they route to the correct recovery handler (e.g. _try_shrink_image_parts).
|
||||
"""
|
||||
|
||||
# Reproduces the phrase list used in run_agent.py's error-handler block.
|
||||
_REJECTION_PHRASES = (
|
||||
"only 'text' content type is supported",
|
||||
"only text content type is supported",
|
||||
"image_url is not supported",
|
||||
"image content is not supported",
|
||||
"multimodal is not supported",
|
||||
"multimodal content is not supported",
|
||||
"multimodal input is not supported",
|
||||
"vision is not supported",
|
||||
"vision input is not supported",
|
||||
"does not support images",
|
||||
"does not support image input",
|
||||
"does not support multimodal",
|
||||
"does not support vision",
|
||||
"model does not support image",
|
||||
)
|
||||
|
||||
def _matches(self, body: str) -> bool:
|
||||
low = body.lower()
|
||||
return any(p in low for p in self._REJECTION_PHRASES)
|
||||
|
||||
def test_anthropic_image_too_large_does_not_trip(self):
|
||||
# From agent/error_classifier.py _IMAGE_TOO_LARGE_PATTERNS —
|
||||
# these must route to image_too_large / _try_shrink_image_parts_in_messages,
|
||||
# NOT to our vision-unsupported fallback.
|
||||
bodies = [
|
||||
"messages.0.content.1.image.source.base64: image exceeds 5 MB maximum",
|
||||
"image too large: 6291456 bytes > 5242880 limit",
|
||||
"image_too_large",
|
||||
"image size exceeds per-request limit",
|
||||
]
|
||||
for body in bodies:
|
||||
assert self._matches(body) is False, f"false positive on: {body}"
|
||||
|
||||
def test_context_overflow_does_not_trip(self):
|
||||
bodies = [
|
||||
"This model's maximum context length is 200000 tokens.",
|
||||
"Request too large: max tokens per request is 200000",
|
||||
"The input exceeds the context window.",
|
||||
]
|
||||
for body in bodies:
|
||||
assert self._matches(body) is False, f"false positive on: {body}"
|
||||
|
||||
def test_rate_limit_does_not_trip(self):
|
||||
bodies = [
|
||||
"rate limit reached for requests",
|
||||
"You exceeded your current quota",
|
||||
]
|
||||
for body in bodies:
|
||||
assert self._matches(body) is False
|
||||
|
||||
def test_real_image_rejection_bodies_trip(self):
|
||||
"""Positive cases — real-world error wordings that should trigger."""
|
||||
bodies = [
|
||||
"Only 'text' content type is supported.",
|
||||
"Bad request: multimodal is not supported by this model",
|
||||
"This model does not support images",
|
||||
"vision is not supported on this endpoint",
|
||||
"model does not support image input",
|
||||
]
|
||||
for body in bodies:
|
||||
assert self._matches(body) is True, f"false negative on: {body}"
|
||||
|
|
@ -296,6 +296,7 @@ class TestBuiltinDiscovery:
|
|||
"tools.browser_tool",
|
||||
"tools.clarify_tool",
|
||||
"tools.code_execution_tool",
|
||||
"tools.computer_use_tool",
|
||||
"tools.cronjob_tools",
|
||||
"tools.delegate_tool",
|
||||
"tools.discord_tool",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue