From 9513793ad7832ef0d2d6c7359eb27d57238b5934 Mon Sep 17 00:00:00 2001 From: islam666 Date: Sun, 7 Jun 2026 08:34:45 +0000 Subject: [PATCH] fix(vision): proactive downgrade for providers rejecting list-type tool content (#41072) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Xiaomi MiMo (and potentially other providers) supports multimodal user messages but rejects list-type tool message content with 400 'text is not set'. Previously this was handled reactively — the API call would fail, images would be stripped, and the request retried, losing visual info and wasting a round-trip. Fix: add supports_vision_tool_messages field to ProviderProfile (default True). Xiaomi sets it to False. _tool_result_content_for_active_model now checks this field proactively and returns a text summary instead of list content, avoiding the round-trip failure entirely. Note: the image is still not sent to such providers (the tool result is downgraded to its text summary); only the failed request and retry are avoided. --- plugins/model-providers/xiaomi/__init__.py | 1 + providers/base.py | 9 +- run_agent.py | 35 ++- .../test_multimodal_tool_content_recovery.py | 34 +-- tests/run_agent/test_vision_tool_messages.py | 212 ++++++++++++++++++ 5 files changed, 269 insertions(+), 22 deletions(-) create mode 100644 tests/run_agent/test_vision_tool_messages.py diff --git a/plugins/model-providers/xiaomi/__init__.py b/plugins/model-providers/xiaomi/__init__.py index 93c7dbb29e5..8cd378d7609 100644 --- a/plugins/model-providers/xiaomi/__init__.py +++ b/plugins/model-providers/xiaomi/__init__.py @@ -10,6 +10,7 @@ xiaomi = ProviderProfile( base_url="https://api.xiaomimimo.com/v1", supports_health_check=False, # /v1/models returns 401 even with valid key supports_vision=True, # mimo-v2-omni is vision-capable + supports_vision_tool_messages=False, # rejects list-type tool content (400 "text is not set") ) register_provider(xiaomi) diff --git a/providers/base.py b/providers/base.py index d7ff470d891..07100a3b52a 100644 --- a/providers/base.py +++ b/providers/base.py @@ -60,11 +60,18 @@ class ProviderProfile: # True when the provider's 
API accepts image content inside # tool-result messages natively. Set on providers that expose # multimodal models via tool results (Anthropic Messages API, - # OpenAI Chat Completions, Gemini, Xiaomi, MiniMax, etc.). + # OpenAI Chat Completions, Gemini, MiniMax, etc.). # Falls back to model-catalog lookup when False and the provider # has no registered profile. supports_vision: bool = False + # True when the provider's API accepts list-type tool message + # content (multipart with image_url parts). Defaults to True for + # backward compatibility. Set to False for providers that accept + # multimodal user messages but reject list-type tool content + # (e.g. Xiaomi MiMo, which returns 400 "text is not set"). + supports_vision_tool_messages: bool = True + # ── Model catalog ───────────────────────────────────────── # fallback_models: curated list shown in /model picker when live fetch fails. # Only agentic models that support tool calling should appear here. diff --git a/run_agent.py b/run_agent.py index 81ce106428b..c6cc1e21581 100644 --- a/run_agent.py +++ b/run_agent.py @@ -4255,6 +4255,23 @@ class AIAgent: except Exception: return False + def _provider_supports_vision_tool_messages(self) -> bool: + """Return True if the active provider accepts list-type tool content. + + Some providers (e.g. Xiaomi MiMo) support multimodal user messages + but reject list-type tool message content with 400 errors. This + checks the provider profile's ``supports_vision_tool_messages`` field. 
+ """ + try: + from providers import get_provider_profile + provider = (getattr(self, "provider", "") or "").strip() + profile = get_provider_profile(provider) + if profile is not None: + return getattr(profile, "supports_vision_tool_messages", True) + except Exception: + pass + return True # default: assume compatible + def _preprocess_anthropic_content(self, content: Any, role: str) -> Any: if not self._content_has_image_parts(content): return content @@ -4394,13 +4411,17 @@ class AIAgent: return content if self._model_supports_vision(): - # Vision-capable on paper — but if we've already learned in this - # session that the active (provider, model) rejects list-type - # tool content (e.g. Xiaomi MiMo's 400 "text is not set"), - # short-circuit to a text summary so we don't burn another - # round-trip relearning the same lesson. Cache populated by - # the 400 recovery path in agent.conversation_loop. Transient - # per-session; next session retries. + # Vision-capable on paper — but if the provider rejects list-type + # tool content (e.g. Xiaomi MiMo's 400 "text is not set"), or if + # we've already learned this lesson in-session, short-circuit to + # a text summary so we don't burn a round-trip relearning it. 
+ if not self._provider_supports_vision_tool_messages(): + logger.debug( + "Tool %s: provider %s does not accept list-type tool " + "content — sending text summary", + tool_name, getattr(self, "provider", ""), + ) + return _multimodal_text_summary(result) key = ( (getattr(self, "provider", "") or "").strip().lower(), (getattr(self, "model", "") or "").strip(), diff --git a/tests/run_agent/test_multimodal_tool_content_recovery.py b/tests/run_agent/test_multimodal_tool_content_recovery.py index 0d9deef9394..a33a2a1a7b0 100644 --- a/tests/run_agent/test_multimodal_tool_content_recovery.py +++ b/tests/run_agent/test_multimodal_tool_content_recovery.py @@ -181,16 +181,20 @@ class TestToolResultContentShortCircuit: "png_bytes": 1024}, } - def test_returns_list_when_cache_empty_and_vision_supported(self, monkeypatch): + def test_returns_text_summary_for_xiaomi_proactively(self, monkeypatch): + """Xiaomi MiMo rejects list-type tool content, so even with an + empty cache, _tool_result_content_for_active_model should + proactively downgrade to a text summary.""" agent = _make_agent(provider="xiaomi", model="mimo-v2.5") agent._no_list_tool_content_models = set() # explicit empty monkeypatch.setattr(agent, "_model_supports_vision", lambda: True) out = agent._tool_result_content_for_active_model( "computer_use", self._multimodal_result() ) - # Native multimodal path: returns the content parts list. - assert isinstance(out, list) - assert any(p.get("type") == "image_url" for p in out) + # Proactive downgrade: text summary instead of list with images. 
+ assert isinstance(out, str) + assert "data:image" not in out + assert "image_url" not in out def test_returns_text_summary_when_model_in_cache(self, monkeypatch): agent = _make_agent(provider="xiaomi", model="mimo-v2.5") @@ -204,29 +208,31 @@ class TestToolResultContentShortCircuit: assert "data:image" not in out assert "image_url" not in out - def test_cache_miss_on_different_model(self, monkeypatch): - """Cache is per (provider, model). A cached entry for mimo-v2.5 - must NOT affect a session running on a different model. - """ + def test_xiaomi_any_model_gets_text_summary(self, monkeypatch): + """All Xiaomi models reject list-type tool content, so even a + different model on the same provider gets a text summary.""" agent = _make_agent(provider="xiaomi", model="mimo-v2.5-pro") agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")} monkeypatch.setattr(agent, "_model_supports_vision", lambda: True) out = agent._tool_result_content_for_active_model( "computer_use", self._multimodal_result() ) - assert isinstance(out, list) + assert isinstance(out, str) + assert "data:image" not in out def test_missing_cache_attribute_falls_through(self, monkeypatch): - """Tests that build agents via ``object.__new__`` without calling - ``__init__`` must not crash — the cache attribute may be absent. - """ - agent = _make_agent() + """Agents built via ``object.__new__`` without calling ``__init__`` + must not crash — the cache attribute may be absent. Xiaomi still + gets a text summary because the provider profile says so.""" + agent = _make_agent(provider="xiaomi", model="mimo-v2.5") # Deliberately do not assign _no_list_tool_content_models. monkeypatch.setattr(agent, "_model_supports_vision", lambda: True) out = agent._tool_result_content_for_active_model( "computer_use", self._multimodal_result() ) - assert isinstance(out, list) + # Xiaomi proactively downgrades regardless of cache state. 
+ assert isinstance(out, str) + assert "data:image" not in out # ─── Classifier ────────────────────────────────────────────────────────────── diff --git a/tests/run_agent/test_vision_tool_messages.py b/tests/run_agent/test_vision_tool_messages.py new file mode 100644 index 00000000000..9417fdeaf11 --- /dev/null +++ b/tests/run_agent/test_vision_tool_messages.py @@ -0,0 +1,212 @@ +"""Tests for proactive vision-tool-message downgrade (issue #41072). + +When a provider supports vision in user messages but rejects list-type +tool message content (e.g. Xiaomi MiMo's 400 "text is not set"), +``_tool_result_content_for_active_model`` should proactively downgrade +to a text summary instead of waiting for a reactive 400 recovery. + +The fix adds ``supports_vision_tool_messages`` to ``ProviderProfile`` +and checks it in ``_tool_result_content_for_active_model``. +""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_agent(provider="openrouter", model="gpt-4o"): + """Create a minimal AIAgent mock with provider/model attributes.""" + from run_agent import AIAgent + agent = MagicMock(spec=AIAgent) + agent.provider = provider + agent.model = model + agent._no_list_tool_content_models = set() + + def _real_content_has_image_parts(content): + if not isinstance(content, list): + return False + for part in content: + if isinstance(part, dict) and part.get("type") in {"image_url", "input_image"}: + return True + return False + + agent._content_has_image_parts = _real_content_has_image_parts + agent._model_supports_vision = lambda: AIAgent._model_supports_vision(agent) + agent._provider_supports_vision_tool_messages = lambda: AIAgent._provider_supports_vision_tool_messages(agent) + agent._tool_result_content_for_active_model = ( + lambda name, 
result: AIAgent._tool_result_content_for_active_model(agent, name, result) + ) + return agent + + +def _multimodal_result(text="screenshot", image_url="data:image/png;base64,AAAA"): + return { + "_multimodal": True, + "content": [ + {"type": "text", "text": text}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + "text_summary": text, + } + + +# --------------------------------------------------------------------------- +# _provider_supports_vision_tool_messages +# --------------------------------------------------------------------------- + + +class TestProviderSupportsVisionToolMessages: + def test_xiaomi_returns_false(self): + agent = _make_agent("xiaomi", "mimo-v2.5") + assert agent._provider_supports_vision_tool_messages() is False + + def test_xiaomi_alias_mimo_returns_false(self): + agent = _make_agent("mimo", "mimo-v2.5") + assert agent._provider_supports_vision_tool_messages() is False + + def test_unknown_provider_defaults_true(self): + agent = _make_agent("some-unknown-provider", "model-v1") + assert agent._provider_supports_vision_tool_messages() is True + + def test_openrouter_defaults_true(self): + agent = _make_agent("openrouter", "gpt-4o") + assert agent._provider_supports_vision_tool_messages() is True + + def test_anthropic_defaults_true(self): + agent = _make_agent("anthropic", "claude-sonnet-4") + assert agent._provider_supports_vision_tool_messages() is True + + def test_empty_provider_defaults_true(self): + agent = _make_agent("", "") + assert agent._provider_supports_vision_tool_messages() is True + + +# --------------------------------------------------------------------------- +# _tool_result_content_for_active_model — proactive downgrade +# --------------------------------------------------------------------------- + + +class TestToolResultContentProactiveDowngrade: + def test_xiaomi_downgrades_to_text_summary(self): + """Xiaomi: vision=True but supports_vision_tool_messages=False → text.""" + agent = _make_agent("xiaomi", 
"mimo-v2.5") + result = _multimodal_result(text="screenshot captured") + + with patch.object(agent, "_model_supports_vision", return_value=True): + content = agent._tool_result_content_for_active_model("browser_screenshot", result) + + assert isinstance(content, str) + assert "screenshot captured" in content + + def test_xiaomi_non_multimodal_passes_through(self): + """Non-multimodal results should pass through unchanged.""" + agent = _make_agent("xiaomi", "mimo-v2.5") + result = "plain text result" + + content = agent._tool_result_content_for_active_model("some_tool", result) + + assert content == "plain text result" + + def test_openrouter_vision_keeps_list_content(self): + """OpenRouter with vision: list content preserved.""" + agent = _make_agent("openrouter", "gpt-4o") + result = _multimodal_result() + + with patch.object(agent, "_model_supports_vision", return_value=True): + content = agent._tool_result_content_for_active_model("browser_screenshot", result) + + assert isinstance(content, list) + assert any(p.get("type") == "image_url" for p in content if isinstance(p, dict)) + + def test_non_vision_model_gets_text_summary(self): + """Non-vision model: text summary regardless of provider.""" + agent = _make_agent("openrouter", "gpt-3.5-turbo") + result = _multimodal_result(text="screenshot") + + with patch.object(agent, "_model_supports_vision", return_value=False): + content = agent._tool_result_content_for_active_model("browser_screenshot", result) + + assert isinstance(content, str) + assert "screenshot" in content + + def test_xiaomi_computer_use_gets_text_summary(self): + """Xiaomi + computer_use: text summary (not the error dict).""" + agent = _make_agent("xiaomi", "mimo-v2.5") + result = _multimodal_result(text="desktop screenshot") + + with patch.object(agent, "_model_supports_vision", return_value=True): + content = agent._tool_result_content_for_active_model("computer_use", result) + + # Should be a text summary, not the error dict for non-vision 
models + assert isinstance(content, str) + assert "desktop screenshot" in content + + def test_xiaomi_no_image_parts_returns_content(self): + """Xiaomi tool result with no image parts: returns content list.""" + agent = _make_agent("xiaomi", "mimo-v2.5") + result = { + "_multimodal": True, + "content": [{"type": "text", "text": "just text"}], + } + + with patch.object(agent, "_model_supports_vision", return_value=True): + content = agent._tool_result_content_for_active_model("some_tool", result) + + # No image parts → returns content as-is + assert isinstance(content, list) + + def test_reactive_cache_still_works(self): + """In-session cache (_no_list_tool_content_models) still triggers.""" + agent = _make_agent("openrouter", "some-model") + agent._no_list_tool_content_models = {("openrouter", "some-model")} + result = _multimodal_result(text="cached downgrade") + + with patch.object(agent, "_model_supports_vision", return_value=True): + content = agent._tool_result_content_for_active_model("browser_screenshot", result) + + assert isinstance(content, str) + assert "cached downgrade" in content + + +# --------------------------------------------------------------------------- +# ProviderProfile.supports_vision_tool_messages field +# --------------------------------------------------------------------------- + + +class TestProviderProfileField: + def test_default_is_true(self): + from providers.base import ProviderProfile + # ProviderProfile uses __init__ with defaults; check via a minimal instance + # by reading the class-level default from a dataclass-like field + import dataclasses + if dataclasses.is_dataclass(ProviderProfile): + fields = {f.name: f.default for f in dataclasses.fields(ProviderProfile)} + assert fields.get("supports_vision_tool_messages", True) is True + else: + # Class-level attribute default + assert getattr(ProviderProfile, "supports_vision_tool_messages", True) is True + + def test_xiaomi_profile_has_false(self): + from providers import 
get_provider_profile + profile = get_provider_profile("xiaomi") + assert profile is not None + assert profile.supports_vision_tool_messages is False + + def test_xiaomi_alias_mimo_has_false(self): + from providers import get_provider_profile + profile = get_provider_profile("mimo") + assert profile is not None + assert profile.supports_vision_tool_messages is False + + def test_anthropic_profile_defaults_true(self): + from providers import get_provider_profile + profile = get_provider_profile("anthropic") + if profile is not None: + assert profile.supports_vision_tool_messages is True