fix(agent): recover from providers rejecting list-type tool content (#27344) (#30259)

Some providers (Xiaomi MiMo, some Alibaba endpoints, a long tail of OpenAI-compatible servers) follow the OpenAI spec strictly and require tool message `content` to be a string — they reject our list-type content (text + image_url parts) with HTTP 400 'text is not set' / 'tool message content must be a string'. Instead of an allowlist of known-good providers (maintenance burden, guaranteed to miss aggregators like OpenRouter where the underlying model determines support, not the aggregator name), this lands a reactive recovery: 1. New `FailoverReason.multimodal_tool_content_unsupported` with a small pattern list covering the common 400 wordings. 2. `AIAgent._try_strip_image_parts_from_tool_messages` walks the API message list, downgrades any `role:tool` message whose content is list-with-image to a plain text summary (preserves text parts) in place, AND records the active (provider, model) in a session-scoped `_no_list_tool_content_models` set. 3. `_tool_result_content_for_active_model` short-circuits to a text summary when (provider, model) is in the cache — so after the first 400 + retry, subsequent screenshots in the same session skip the round trip entirely. 4. Retry hook in `agent.conversation_loop` mirrors the existing `image_too_large` recovery: detect the reason, run the helper, retry once, fall through to the normal error path if no list-type tool content was actually present. Cache is transient (per-session) by design — next session retries in case the provider added support, no persistent state to maintain. Fixes #27344. Closes #27351 (allowlist approach superseded by reactive recovery).
2026-06-04 07:31:58 +00:00 · 2026-05-21 23:40:16 -07:00 · 2026-05-21 23:40:16 -07:00 · c769be344a
commit c769be344a
parent 372e9a18cd
5 changed files with 490 additions and 0 deletions
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@ -50,6 +50,7 @@ class FailoverReason(enum.Enum):

    # Request format
    format_error = "format_error"        # 400 bad request — abort or strip + retry
+    multimodal_tool_content_unsupported = "multimodal_tool_content_unsupported"  # Provider rejected list-type content in tool messages (e.g. Xiaomi MiMo) — downgrade to text and retry

    # Provider-specific
    thinking_signature = "thinking_signature"  # Anthropic thinking block sig invalid
@ -165,6 +166,32 @@ _IMAGE_TOO_LARGE_PATTERNS = [
    # the likely culprit; we still try the shrink path before giving up.
 ]

+# Providers that follow the OpenAI spec strictly require tool message
+# ``content`` to be a string.  Some (Anthropic native, Codex Responses,
+# Gemini native, first-party OpenAI) extend this to accept a content-parts
+# list (text + image_url) so screenshots from computer_use survive.  Others
+# (Xiaomi MiMo, some Alibaba endpoints, a long tail of OpenAI-compatible
+# providers) reject the list with a 400 — the patterns below are the most
+# common error shapes we see.  Recovery: strip image parts from tool
+# messages in-place, record the (provider, model) for the rest of the
+# session so we don't waste another call learning the same lesson, retry.
+#
+# See: https://github.com/NousResearch/hermes-agent/issues/27344
+_MULTIMODAL_TOOL_CONTENT_PATTERNS = [
+    # Xiaomi MiMo: {"error":{"code":"400","message":"Param Incorrect","param":"text is not set"}}
+    "text is not set",
+    # Generic "tool message must be string" shapes
+    "tool message content must be a string",
+    "tool content must be a string",
+    "tool message must be a string",
+    # OpenAI-compat servers that reject list-type tool content with a
+    # schema-validation message
+    "expected string, got list",
+    "expected string, got array",
+    # Alibaba/DashScope variant
+    "tool_call.content must be string",
+]
+
 # Context overflow patterns
 _CONTEXT_OVERFLOW_PATTERNS = [
    "context length",
@ -781,6 +808,19 @@ def _classify_400(
 ) -> ClassifiedError:
    """Classify 400 Bad Request — context overflow, format error, or generic."""

+    # Multimodal tool content rejected from 400.  Must be checked BEFORE
+    # image_too_large because the recovery is different (strip image parts
+    # from tool messages, mark the model as no-list-tool-content for the
+    # rest of the session) and BEFORE context_overflow because some of the
+    # patterns ("text is not set") are ambiguous in isolation but become
+    # specific when combined with a 400 on a request known to contain
+    # multimodal tool content.
+    if any(p in error_msg for p in _MULTIMODAL_TOOL_CONTENT_PATTERNS):
+        return result_fn(
+            FailoverReason.multimodal_tool_content_unsupported,
+            retryable=True,
+        )
+
    # Image-too-large from 400 (Anthropic's 5 MB per-image check fires this way).
    # Must be checked BEFORE context_overflow because messages can trip both
    # patterns ("exceeds" + "image") and image-shrink is a cheaper recovery.
@ -922,6 +962,13 @@ def _classify_by_message(
            should_compress=True,
        )

+    # Multimodal tool content patterns (from message text when no status_code)
+    if any(p in error_msg for p in _MULTIMODAL_TOOL_CONTENT_PATTERNS):
+        return result_fn(
+            FailoverReason.multimodal_tool_content_unsupported,
+            retryable=True,
+        )
+
    # Image-too-large patterns (from message text when no status_code)
    if any(p in error_msg for p in _IMAGE_TOO_LARGE_PATTERNS):
        return result_fn(