fix(computer-use): harden image-rejection fallback + AUTHOR_MAP

Follow-up to #15328's vision-unsupported retry branch in run_agent.py.

_strip_images_from_messages() previously deleted any message whose content
was entirely images. That's fine for synthetic user messages injected for
attachment delivery, but it breaks providers for tool-role messages — the
paired tool_call_id on the preceding assistant message ends up unmatched,
which OpenAI-compatible APIs reject with HTTP 400.

Fix: tool-role messages whose content becomes empty are replaced with a
plaintext placeholder that preserves the tool_call_id linkage. Only
non-tool messages are dropped. Added 10 tests covering the role-alternation
invariants and image-type handling.

Image-rejection detector: expanded phrase list (image content not
supported / multimodal input / vision input / model does not support
image) and gated on 4xx status so transient 5xx errors never get
misinterpreted as 'server said no to images'. Detection is documented as
best-effort English phrase matching.

AUTHOR_MAP: mapped 3820588+ddupont808@users.noreply.github.com to
ddupont808 so release notes attribute the salvage correctly.
This commit is contained in:
Teknium 2026-04-28 01:15:46 -07:00 committed by Teknium
parent 2937f9bef6
commit d0aad4b021
4 changed files with 288 additions and 5 deletions

View file

@ -871,6 +871,15 @@ def _strip_images_from_messages(messages: list) -> bool:
"Only 'text' content type is supported."). Mutates messages so the
next API call sends text only.
Preserves message alternation invariants:
* ``tool``-role messages whose content was entirely images are replaced
with a plaintext placeholder, NOT deleted — deleting them would leave
the paired ``tool_call_id`` on the prior assistant message unmatched,
which providers reject with HTTP 400.
* Non-tool messages whose content becomes empty are dropped. In
practice this only hits synthetic image-only user messages appended
for attachment delivery; real user turns always include text.
Returns True if any image parts were removed.
"""
found = False
@ -890,9 +899,13 @@ def _strip_images_from_messages(messages: list) -> bool:
if len(new_parts) < len(content):
if new_parts:
msg["content"] = new_parts
elif msg.get("role") == "tool":
# Preserve tool_call_id linkage — providers require every
# assistant tool_call to have a matching tool response.
msg["content"] = "[image content removed — server does not support images]"
else:
# Entire message was images — drop it (user messages added for
# image delivery only, e.g. the deferred injection messages).
# Synthetic image-only user/assistant message with no text;
# safe to drop.
to_delete.append(i)
for i in reversed(to_delete):
del messages[i]
@ -12581,11 +12594,18 @@ class AIAgent:
continue
# ── Image-rejection recovery ──────────────────────────────
# Some providers (mlx-lm, text-only endpoints) reject any
# message that contains image_url content with an error like
# Some providers (mlx-lm, text-only endpoints, text-only
# fallbacks on multimodal models) reject any message that
# contains image_url content with a 4xx error like
# "Only 'text' content type is supported." On first hit,
# strip all images from the message list, mark the session
# as vision-unsupported, and retry with text only.
#
# Detection is best-effort English phrase matching — a
# locale-translated or heavily-reworded upstream error
# will bypass this guard and fall through to the normal
# error handler. Expand the phrase list when new
# provider wordings are observed in the wild.
_err_body = ""
try:
_err_body = str(getattr(api_error, "body", None) or
@ -12593,17 +12613,35 @@ class AIAgent:
str(api_error))
except Exception:
pass
_err_status = getattr(api_error, "status_code", None)
_IMAGE_REJECTION_PHRASES = (
"only 'text' content type is supported",
"only text content type is supported",
"image_url is not supported",
"image content is not supported",
"multimodal is not supported",
"multimodal content is not supported",
"multimodal input is not supported",
"vision is not supported",
"vision input is not supported",
"does not support images",
"does not support image input",
"does not support multimodal",
"does not support vision",
"model does not support image",
)
_err_lower = _err_body.lower()
_looks_like_image_rejection = any(
p in _err_lower for p in _IMAGE_REJECTION_PHRASES
)
# 4xx-only gate: never interpret 5xx/timeout as "server
# said no to images" — those are transient and must
# route to the normal retry path.
_status_ok = _err_status is None or (400 <= int(_err_status) < 500)
if (
getattr(self, "_vision_supported", True)
and any(p in _err_body.lower() for p in _IMAGE_REJECTION_PHRASES)
and _looks_like_image_rejection
and _status_ok
):
self._vision_supported = False
_imgs_removed = _strip_images_from_messages(messages)