diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index d3cad0c35fd..fdf65c07558 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -989,6 +989,7 @@ def run_conversation( copilot_auth_retry_attempted=False thinking_sig_retry_attempted = False image_shrink_retry_attempted = False + multimodal_tool_content_retry_attempted = False oauth_1m_beta_retry_attempted = False llama_cpp_grammar_retry_attempted = False has_retried_429 = False @@ -2060,6 +2061,31 @@ def run_conversation( "or shrink didn't reduce size; surfacing original error." ) + # Multimodal-tool-content recovery: providers that follow + # the OpenAI spec strictly (tool message content must be a + # string) reject our list-type content with a 400. Strip + # image parts from any list-type tool messages, mark the + # (provider, model) as no-list-tool-content for the rest + # of this session so future tool results preemptively + # downgrade, and retry once. See issue #27344. + if ( + classified.reason == FailoverReason.multimodal_tool_content_unsupported + and not multimodal_tool_content_retry_attempted + ): + multimodal_tool_content_retry_attempted = True + if agent._try_strip_image_parts_from_tool_messages(api_messages): + agent._vprint( + f"{agent.log_prefix}📐 Provider rejected list-type tool content — " + f"downgraded screenshots to text and retrying...", + force=True, + ) + continue + else: + logger.info( + "multimodal-tool-content recovery: no list-type tool " + "messages with image parts found; surfacing original error." + ) + # Anthropic OAuth subscription rejected the 1M-context beta # header ("long context beta is not yet available for this # subscription"). Disable the beta for the rest of this diff --git a/agent/error_classifier.py b/agent/error_classifier.py index 42eb42d6803..7fa38bbcf70 100644 --- a/agent/error_classifier.py +++ b/agent/error_classifier.py @@ -50,6 +50,7 @@ class FailoverReason(enum.Enum): # Request format format_error = "format_error" # 400 bad request — abort or strip + retry + multimodal_tool_content_unsupported = "multimodal_tool_content_unsupported" # Provider rejected list-type content in tool messages (e.g. Xiaomi MiMo) — downgrade to text and retry # Provider-specific thinking_signature = "thinking_signature" # Anthropic thinking block sig invalid @@ -165,6 +166,32 @@ _IMAGE_TOO_LARGE_PATTERNS = [ # the likely culprit; we still try the shrink path before giving up. ] +# Providers that follow the OpenAI spec strictly require tool message +# ``content`` to be a string. Some (Anthropic native, Codex Responses, +# Gemini native, first-party OpenAI) extend this to accept a content-parts +# list (text + image_url) so screenshots from computer_use survive. Others +# (Xiaomi MiMo, some Alibaba endpoints, a long tail of OpenAI-compatible +# providers) reject the list with a 400 — the patterns below are the most +# common error shapes we see. Recovery: strip image parts from tool +# messages in-place, record the (provider, model) for the rest of the +# session so we don't waste another call learning the same lesson, retry. +# +# See: https://github.com/NousResearch/hermes-agent/issues/27344 +_MULTIMODAL_TOOL_CONTENT_PATTERNS = [ + # Xiaomi MiMo: {"error":{"code":"400","message":"Param Incorrect","param":"text is not set"}} + "text is not set", + # Generic "tool message must be string" shapes + "tool message content must be a string", + "tool content must be a string", + "tool message must be a string", + # OpenAI-compat servers that reject list-type tool content with a + # schema-validation message + "expected string, got list", + "expected string, got array", + # Alibaba/DashScope variant + "tool_call.content must be string", +] + # Context overflow patterns _CONTEXT_OVERFLOW_PATTERNS = [ "context length", @@ -781,6 +808,19 @@ def _classify_400( ) -> ClassifiedError: """Classify 400 Bad Request — context overflow, format error, or generic.""" + # Multimodal tool content rejected from 400. Must be checked BEFORE + # image_too_large because the recovery is different (strip image parts + # from tool messages, mark the model as no-list-tool-content for the + # rest of the session) and BEFORE context_overflow because some of the + # patterns ("text is not set") are ambiguous in isolation but become + # specific when combined with a 400 on a request known to contain + # multimodal tool content. + if any(p in error_msg for p in _MULTIMODAL_TOOL_CONTENT_PATTERNS): + return result_fn( + FailoverReason.multimodal_tool_content_unsupported, + retryable=True, + ) + # Image-too-large from 400 (Anthropic's 5 MB per-image check fires this way). # Must be checked BEFORE context_overflow because messages can trip both # patterns ("exceeds" + "image") and image-shrink is a cheaper recovery. @@ -922,6 +962,13 @@ def _classify_by_message( should_compress=True, ) + # Multimodal tool content patterns (from message text when no status_code) + if any(p in error_msg for p in _MULTIMODAL_TOOL_CONTENT_PATTERNS): + return result_fn( + FailoverReason.multimodal_tool_content_unsupported, + retryable=True, + ) + # Image-too-large patterns (from message text when no status_code) if any(p in error_msg for p in _IMAGE_TOO_LARGE_PATTERNS): return result_fn( diff --git a/run_agent.py b/run_agent.py index 001d03784ad..5b89839b645 100644 --- a/run_agent.py +++ b/run_agent.py @@ -3357,6 +3357,25 @@ class AIAgent: return content if self._model_supports_vision(): + # Vision-capable on paper — but if we've already learned in this + # session that the active (provider, model) rejects list-type + # tool content (e.g. Xiaomi MiMo's 400 "text is not set"), + # short-circuit to a text summary so we don't burn another + # round-trip relearning the same lesson. Cache populated by + # the 400 recovery path in agent.conversation_loop. Transient + # per-session; next session retries. + key = ( + (getattr(self, "provider", "") or "").strip().lower(), + (getattr(self, "model", "") or "").strip(), + ) + no_list = getattr(self, "_no_list_tool_content_models", None) + if no_list and key in no_list: + logger.debug( + "Tool %s: model %s/%s known to reject list-type tool " + "content this session — sending text summary", + tool_name, key[0], key[1], + ) + return _multimodal_text_summary(result) return content summary = _multimodal_text_summary(result) @@ -3385,6 +3404,80 @@ class AIAgent: from agent.conversation_compression import try_shrink_image_parts_in_messages return try_shrink_image_parts_in_messages(api_messages) + def _try_strip_image_parts_from_tool_messages(self, api_messages: list) -> bool: + """Downgrade list-type tool messages to text summaries in-place. + + Recovery path for providers that reject list-type tool message content + (e.g. Xiaomi MiMo's 400 "text is not set"; see issue #27344). Walks + ``api_messages`` for any ``role: "tool"`` message whose ``content`` is + a list containing image parts, replaces the content with the existing + text part(s) (or a minimal placeholder if none survive), and records + the active (provider, model) in ``self._no_list_tool_content_models`` + so subsequent ``_tool_result_content_for_active_model`` calls in this + session preemptively downgrade screenshots without a round-trip. + + Returns True when at least one tool message was downgraded — the + caller (the 400 recovery branch in ``agent.conversation_loop``) uses + this to decide whether to retry the API call with the modified + history or surface the original error. + """ + if not isinstance(api_messages, list): + return False + + # Record (provider, model) so we don't relearn this lesson. + key = ( + (getattr(self, "provider", "") or "").strip().lower(), + (getattr(self, "model", "") or "").strip(), + ) + if not hasattr(self, "_no_list_tool_content_models"): + self._no_list_tool_content_models = set() + if key[1]: # only record when we actually have a model id + self._no_list_tool_content_models.add(key) + + changed = False + for msg in api_messages: + if not isinstance(msg, dict) or msg.get("role") != "tool": + continue + content = msg.get("content") + if not isinstance(content, list): + continue + + # Salvage any text parts so the model still sees some signal. + text_parts: List[str] = [] + had_image = False + for part in content: + if not isinstance(part, dict): + if isinstance(part, str) and part.strip(): + text_parts.append(part.strip()) + continue + ptype = part.get("type") + if ptype == "image_url" or ptype == "input_image": + had_image = True + continue + if ptype in {"text", "input_text"}: + text = str(part.get("text") or "").strip() + if text: + text_parts.append(text) + + if not had_image: + # List-type content but no image parts — leave alone (some + # providers reject ANY list content, but stripping a + # text-only list doesn't reduce ambiguity; let the caller + # surface the original error if this turns out to be the + # case). + continue + + if text_parts: + msg["content"] = "\n\n".join(text_parts) + else: + msg["content"] = ( + "[image content removed — provider does not accept " + "list-type tool message content]" + ) + changed = True + + return changed + def _anthropic_preserve_dots(self) -> bool: """True when using an anthropic-compatible endpoint that preserves dots in model names. Alibaba/DashScope keeps dots (e.g. qwen3.5-plus). diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py index a6fb56a7075..eef3650347c 100644 --- a/tests/agent/test_error_classifier.py +++ b/tests/agent/test_error_classifier.py @@ -56,6 +56,7 @@ class TestFailoverReason: "overloaded", "server_error", "timeout", "context_overflow", "payload_too_large", "image_too_large", "model_not_found", "format_error", + "multimodal_tool_content_unsupported", "provider_policy_blocked", "thinking_signature", "long_context_tier", "oauth_long_context_beta_forbidden", @@ -1256,3 +1257,66 @@ class TestRateLimitErrorWithoutStatusCode: e.status_code = None result = classify_api_error(e, provider="copilot", model="gpt-4o") assert result.reason != FailoverReason.rate_limit + + + +# ── Test: multimodal_tool_content_unsupported pattern ─────────────────── + +class TestMultimodalToolContentUnsupported: + """Issue #27344 — providers that reject list-type tool message content + should be classified as ``multimodal_tool_content_unsupported`` so the + retry loop can downgrade screenshots to text and try again. + """ + + def test_xiaomi_mimo_text_is_not_set_pattern(self): + """The actual Xiaomi MiMo 400 wording from the bug report.""" + e = MockAPIError( + "Error code: 400 - {'error': {'code': '400', 'message': 'Param Incorrect', 'param': 'text is not set', 'type': ''}}", + status_code=400, + ) + result = classify_api_error(e, provider="xiaomi", model="mimo-v2.5") + assert result.reason == FailoverReason.multimodal_tool_content_unsupported + assert result.retryable is True + + def test_generic_tool_message_must_be_string(self): + e = MockAPIError( + "tool message content must be a string", + status_code=400, + ) + result = classify_api_error(e, provider="custom", model="some-model") + assert result.reason == FailoverReason.multimodal_tool_content_unsupported + + def test_expected_string_got_list(self): + e = MockAPIError( + "Schema validation failed: expected string, got list", + status_code=400, + ) + result = classify_api_error(e, provider="custom", model="some-model") + assert result.reason == FailoverReason.multimodal_tool_content_unsupported + + def test_multimodal_tool_content_takes_priority_over_context_overflow(self): + """Some providers return a 400 whose message contains BOTH + 'text is not set' and a length-shaped phrase; the tool-content + recovery is cheaper than compression so it must win the priority. + """ + e = MockAPIError( + "text is not set; context length exceeded", + status_code=400, + ) + result = classify_api_error(e, provider="xiaomi", model="mimo-v2.5") + assert result.reason == FailoverReason.multimodal_tool_content_unsupported + + def test_no_status_code_path_also_classifies(self): + """When the error reaches us without a status code (transport + layer ate it) the message-only classifier branch must also + recognise the pattern. + """ + e = MockTransportError("tool_call.content must be string") + result = classify_api_error(e, provider="alibaba", model="qwen3.5-plus") + assert result.reason == FailoverReason.multimodal_tool_content_unsupported + + def test_unrelated_400_is_not_misclassified(self): + """Make sure the patterns don't false-positive on normal 400s.""" + e = MockAPIError("bad request: missing field 'model'", status_code=400) + result = classify_api_error(e, provider="openrouter", model="anthropic/claude-sonnet-4") + assert result.reason != FailoverReason.multimodal_tool_content_unsupported diff --git a/tests/run_agent/test_multimodal_tool_content_recovery.py b/tests/run_agent/test_multimodal_tool_content_recovery.py new file mode 100644 index 00000000000..63ee49f97c0 --- /dev/null +++ b/tests/run_agent/test_multimodal_tool_content_recovery.py @@ -0,0 +1,260 @@ +"""Tests for reactive multimodal-tool-content recovery. + +Covers the full chain for providers that reject list-type content in +``role: "tool"`` messages (Xiaomi MiMo's 400 "text is not set", etc.): + + 1. agent/error_classifier.py: 400 with the right wording classifies as + ``FailoverReason.multimodal_tool_content_unsupported``. + 2. run_agent._try_strip_image_parts_from_tool_messages downgrades tool + messages whose ``content`` is a list-with-image to a string text + summary, in-place, and records the active (provider, model) in + ``self._no_list_tool_content_models`` so future tool results in this + session preemptively downgrade. + 3. run_agent._tool_result_content_for_active_model short-circuits to a + text summary when the (provider, model) is in the cache, even though + ``_model_supports_vision`` returns True — avoiding a wasted round + trip on every subsequent screenshot in the session. + +The end-to-end retry loop wiring (`conversation_loop.py`) is exercised by +the classifier signal + helper-mutation tests; the integration only adds +a trivial flag-and-continue around the existing pattern used for +``image_too_large`` recovery. + +See: https://github.com/NousResearch/hermes-agent/issues/27344 +""" + +from __future__ import annotations + +import pytest + +from agent.error_classifier import FailoverReason, classify_api_error + + +class _FakeApiError(Exception): + """Stand-in for an openai.BadRequestError with status_code + body.""" + + def __init__(self, status_code: int, message: str, body: dict | None = None): + super().__init__(message) + self.status_code = status_code + self.body = body or {"error": {"message": message}} + self.response = None + + +def _make_agent(provider: str = "xiaomi", model: str = "mimo-v2.5"): + """Build a bare AIAgent for method-level testing, no provider setup.""" + from run_agent import AIAgent + agent = object.__new__(AIAgent) + agent.provider = provider + agent.model = model + return agent + + +# ─── Strip helper ──────────────────────────────────────────────────────────── + + +class TestStripImagePartsHelper: + def test_no_messages_returns_false(self): + agent = _make_agent() + assert agent._try_strip_image_parts_from_tool_messages([]) is False + assert agent._try_strip_image_parts_from_tool_messages(None) is False + + def test_no_tool_messages_returns_false(self): + agent = _make_agent() + msgs = [ + {"role": "user", "content": "plain text"}, + {"role": "assistant", "content": "ack"}, + ] + assert agent._try_strip_image_parts_from_tool_messages(msgs) is False + + def test_tool_message_with_string_content_unchanged(self): + agent = _make_agent() + msgs = [ + {"role": "tool", "tool_call_id": "x", "content": "plain string result"}, + ] + assert agent._try_strip_image_parts_from_tool_messages(msgs) is False + assert msgs[0]["content"] == "plain string result" + + def test_tool_message_list_without_image_unchanged(self): + """List content with only text parts is left alone — caller surfaces + the original error if this turns out to also be rejected.""" + agent = _make_agent() + msgs = [ + {"role": "tool", "tool_call_id": "x", "content": [ + {"type": "text", "text": "hello"}, + ]}, + ] + assert agent._try_strip_image_parts_from_tool_messages(msgs) is False + + def test_tool_message_list_with_image_downgrades(self): + agent = _make_agent() + msgs = [ + {"role": "tool", "tool_call_id": "x", "content": [ + {"type": "text", "text": "AX summary: 5 buttons visible"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}}, + ]}, + ] + assert agent._try_strip_image_parts_from_tool_messages(msgs) is True + # Image stripped; text preserved as a string. + assert isinstance(msgs[0]["content"], str) + assert "AX summary" in msgs[0]["content"] + assert "image_url" not in msgs[0]["content"] + assert "iVBOR" not in msgs[0]["content"] + + def test_tool_message_image_only_gets_placeholder(self): + """If the list had nothing but image parts, leave a placeholder so + the assistant message has something to reference.""" + agent = _make_agent() + msgs = [ + {"role": "tool", "tool_call_id": "x", "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}}, + ]}, + ] + assert agent._try_strip_image_parts_from_tool_messages(msgs) is True + assert isinstance(msgs[0]["content"], str) + assert "image content removed" in msgs[0]["content"] + + def test_records_provider_model_in_session_cache(self): + agent = _make_agent(provider="xiaomi", model="mimo-v2.5") + msgs = [ + {"role": "tool", "tool_call_id": "x", "content": [ + {"type": "text", "text": "summary"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}}, + ]}, + ] + agent._try_strip_image_parts_from_tool_messages(msgs) + assert ("xiaomi", "mimo-v2.5") in agent._no_list_tool_content_models + + def test_only_tool_messages_get_downgraded(self): + """User / assistant messages with list-type content are out of + scope — they're handled by the existing image-routing path.""" + agent = _make_agent() + msgs = [ + {"role": "user", "content": [ + {"type": "text", "text": "describe"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}}, + ]}, + {"role": "tool", "tool_call_id": "x", "content": [ + {"type": "text", "text": "summary"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,Y"}}, + ]}, + ] + agent._try_strip_image_parts_from_tool_messages(msgs) + # User message untouched. + assert isinstance(msgs[0]["content"], list) + assert any(p.get("type") == "image_url" for p in msgs[0]["content"]) + # Tool message downgraded. + assert isinstance(msgs[1]["content"], str) + assert "summary" in msgs[1]["content"] + + def test_skips_recording_when_no_model_id(self): + """Don't poison the cache with empty keys when provider/model is + unset (e.g. lazy-initialised mid-handshake).""" + agent = _make_agent(provider="", model="") + msgs = [ + {"role": "tool", "tool_call_id": "x", "content": [ + {"type": "text", "text": "summary"}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}}, + ]}, + ] + agent._try_strip_image_parts_from_tool_messages(msgs) + assert agent._no_list_tool_content_models == set() + + +# ─── Short-circuit on cached models ────────────────────────────────────────── + + +class TestToolResultContentShortCircuit: + """Once the session has learned that (provider, model) rejects list + content, ``_tool_result_content_for_active_model`` returns a text + summary even though ``_model_supports_vision`` reports True. + """ + + def _multimodal_result(self, png_b64: str = "iVBORw0KGgoAAAA"): + return { + "_multimodal": True, + "content": [ + {"type": "text", "text": "capture mode=som 800x600 app=Safari"}, + {"type": "image_url", + "image_url": {"url": f"data:image/png;base64,{png_b64}"}}, + ], + "text_summary": "capture mode=som 800x600 app=Safari", + "meta": {"mode": "som", "width": 800, "height": 600, "elements": 5, + "png_bytes": 1024}, + } + + def test_returns_list_when_cache_empty_and_vision_supported(self, monkeypatch): + agent = _make_agent(provider="xiaomi", model="mimo-v2.5") + agent._no_list_tool_content_models = set() # explicit empty + monkeypatch.setattr(agent, "_model_supports_vision", lambda: True) + out = agent._tool_result_content_for_active_model( + "computer_use", self._multimodal_result() + ) + # Native multimodal path: returns the content parts list. + assert isinstance(out, list) + assert any(p.get("type") == "image_url" for p in out) + + def test_returns_text_summary_when_model_in_cache(self, monkeypatch): + agent = _make_agent(provider="xiaomi", model="mimo-v2.5") + agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")} + monkeypatch.setattr(agent, "_model_supports_vision", lambda: True) + out = agent._tool_result_content_for_active_model( + "computer_use", self._multimodal_result() + ) + # Short-circuit: a plain string summary, no image_url present. + assert isinstance(out, str) + assert "data:image" not in out + assert "image_url" not in out + + def test_cache_miss_on_different_model(self, monkeypatch): + """Cache is per (provider, model). A cached entry for mimo-v2.5 + must NOT affect a session running on a different model. + """ + agent = _make_agent(provider="xiaomi", model="mimo-v2.5-pro") + agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")} + monkeypatch.setattr(agent, "_model_supports_vision", lambda: True) + out = agent._tool_result_content_for_active_model( + "computer_use", self._multimodal_result() + ) + assert isinstance(out, list) + + def test_missing_cache_attribute_falls_through(self, monkeypatch): + """Tests that build agents via ``object.__new__`` without calling + ``__init__`` must not crash — the cache attribute may be absent. + """ + agent = _make_agent() + # Deliberately do not assign _no_list_tool_content_models. + monkeypatch.setattr(agent, "_model_supports_vision", lambda: True) + out = agent._tool_result_content_for_active_model( + "computer_use", self._multimodal_result() + ) + assert isinstance(out, list) + + +# ─── Classifier ────────────────────────────────────────────────────────────── + + +class TestRecoveryEndToEndClassification: + """Lock in that the patterns used by the recovery path classify to + the right ``FailoverReason``. (The recovery hook in + ``agent.conversation_loop`` consumes this reason directly.) + """ + + def test_xiaomi_mimo_classifies(self): + err = _FakeApiError( + status_code=400, + message=( + "Error code: 400 - {'error': {'code': '400', 'message': " + "'Param Incorrect', 'param': 'text is not set', 'type': ''}}" + ), + ) + result = classify_api_error(err, provider="xiaomi", model="mimo-v2.5") + assert result.reason == FailoverReason.multimodal_tool_content_unsupported + assert result.retryable is True + + def test_alibaba_variant_classifies(self): + err = _FakeApiError( + status_code=400, + message="tool_call.content must be string", + ) + result = classify_api_error(err, provider="alibaba", model="qwen3.5-plus") + assert result.reason == FailoverReason.multimodal_tool_content_unsupported