mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-10 03:22:05 +00:00
fix(computer-use): harden image-rejection fallback + AUTHOR_MAP
Follow-up to #15328's vision-unsupported retry branch in run_agent.py. _strip_images_from_messages() previously deleted any message whose content was entirely images. That's fine for synthetic user messages injected for attachment delivery, but it breaks providers for tool-role messages — the paired tool_call_id on the preceding assistant message ends up unmatched, which OpenAI-compatible APIs reject with HTTP 400. Fix: tool-role messages whose content becomes empty are replaced with a plaintext placeholder that preserves the tool_call_id linkage. Only non-tool messages are dropped. Added 10 tests covering the role-alternation invariants + image-type coverage. Image-rejection detector: expanded phrase list (image content not supported / multimodal input / vision input / model does not support image) and gated on 4xx status so transient 5xx errors never get misinterpreted as 'server said no to images'. Detection is documented as best-effort English phrase matching. AUTHOR_MAP: mapped 3820588+ddupont808@users.noreply.github.com to ddupont808 so release notes attribute the salvage correctly.
This commit is contained in:
parent
2937f9bef6
commit
d0aad4b021
4 changed files with 288 additions and 5 deletions
48
run_agent.py
48
run_agent.py
|
|
@ -871,6 +871,15 @@ def _strip_images_from_messages(messages: list) -> bool:
|
|||
"Only 'text' content type is supported."). Mutates messages so the
|
||||
next API call sends text only.
|
||||
|
||||
Preserves message alternation invariants:
|
||||
* ``tool``-role messages whose content was entirely images are replaced
|
||||
with a plaintext placeholder, NOT deleted — deleting them would leave
|
||||
the paired ``tool_call_id`` on the prior assistant message unmatched,
|
||||
which providers reject with HTTP 400.
|
||||
* Non-tool messages whose content becomes empty are dropped. In
|
||||
practice this only hits synthetic image-only user messages appended
|
||||
for attachment delivery; real user turns always include text.
|
||||
|
||||
Returns True if any image parts were removed.
|
||||
"""
|
||||
found = False
|
||||
|
|
@ -890,9 +899,13 @@ def _strip_images_from_messages(messages: list) -> bool:
|
|||
if len(new_parts) < len(content):
|
||||
if new_parts:
|
||||
msg["content"] = new_parts
|
||||
elif msg.get("role") == "tool":
|
||||
# Preserve tool_call_id linkage — providers require every
|
||||
# assistant tool_call to have a matching tool response.
|
||||
msg["content"] = "[image content removed — server does not support images]"
|
||||
else:
|
||||
# Entire message was images — drop it (user messages added for
|
||||
# image delivery only, e.g. the deferred injection messages).
|
||||
# Synthetic image-only user/assistant message with no text;
|
||||
# safe to drop.
|
||||
to_delete.append(i)
|
||||
for i in reversed(to_delete):
|
||||
del messages[i]
|
||||
|
|
@ -12581,11 +12594,18 @@ class AIAgent:
|
|||
continue
|
||||
|
||||
# ── Image-rejection recovery ──────────────────────────────
|
||||
# Some providers (mlx-lm, text-only endpoints) reject any
|
||||
# message that contains image_url content with an error like
|
||||
# Some providers (mlx-lm, text-only endpoints, text-only
|
||||
# fallbacks on multimodal models) reject any message that
|
||||
# contains image_url content with a 4xx error like
|
||||
# "Only 'text' content type is supported." On first hit,
|
||||
# strip all images from the message list, mark the session
|
||||
# as vision-unsupported, and retry with text only.
|
||||
#
|
||||
# Detection is best-effort English phrase matching — a
|
||||
# locale-translated or heavily-reworded upstream error
|
||||
# will bypass this guard and fall through to the normal
|
||||
# error handler. Expand the phrase list when new
|
||||
# provider wordings are observed in the wild.
|
||||
_err_body = ""
|
||||
try:
|
||||
_err_body = str(getattr(api_error, "body", None) or
|
||||
|
|
@ -12593,17 +12613,35 @@ class AIAgent:
|
|||
str(api_error))
|
||||
except Exception:
|
||||
pass
|
||||
_err_status = getattr(api_error, "status_code", None)
|
||||
_IMAGE_REJECTION_PHRASES = (
|
||||
"only 'text' content type is supported",
|
||||
"only text content type is supported",
|
||||
"image_url is not supported",
|
||||
"image content is not supported",
|
||||
"multimodal is not supported",
|
||||
"multimodal content is not supported",
|
||||
"multimodal input is not supported",
|
||||
"vision is not supported",
|
||||
"vision input is not supported",
|
||||
"does not support images",
|
||||
"does not support image input",
|
||||
"does not support multimodal",
|
||||
"does not support vision",
|
||||
"model does not support image",
|
||||
)
|
||||
_err_lower = _err_body.lower()
|
||||
_looks_like_image_rejection = any(
|
||||
p in _err_lower for p in _IMAGE_REJECTION_PHRASES
|
||||
)
|
||||
# 4xx-only gate: never interpret 5xx/timeout as "server
|
||||
# said no to images" — those are transient and must
|
||||
# route to the normal retry path.
|
||||
_status_ok = _err_status is None or (400 <= int(_err_status) < 500)
|
||||
if (
|
||||
getattr(self, "_vision_supported", True)
|
||||
and any(p in _err_body.lower() for p in _IMAGE_REJECTION_PHRASES)
|
||||
and _looks_like_image_rejection
|
||||
and _status_ok
|
||||
):
|
||||
self._vision_supported = False
|
||||
_imgs_removed = _strip_images_from_messages(messages)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue