mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(vision): proactive downgrade for providers rejecting list-type tool content (#41072)
Xiaomi MiMo (and potentially other providers) support multimodal user messages but reject list-type tool message content with 400 'text is not set'. Previously this was handled reactively — the API call would fail, images would be stripped, and the request retried, losing visual info. Fix: add supports_vision_tool_messages field to ProviderProfile (default True). Xiaomi sets it to False. _tool_result_content_for_active_model now checks this field proactively and returns a text summary instead of list content, avoiding the round-trip failure entirely.
This commit is contained in:
parent
41f0714287
commit
9513793ad7
5 changed files with 269 additions and 22 deletions
|
|
@ -10,6 +10,7 @@ xiaomi = ProviderProfile(
|
|||
base_url="https://api.xiaomimimo.com/v1",
|
||||
supports_health_check=False, # /v1/models returns 401 even with valid key
|
||||
supports_vision=True, # mimo-v2-omni is vision-capable
|
||||
supports_vision_tool_messages=False, # rejects list-type tool content (400 "text is not set")
|
||||
)
|
||||
|
||||
register_provider(xiaomi)
|
||||
|
|
|
|||
|
|
@ -60,11 +60,18 @@ class ProviderProfile:
|
|||
# True when the provider's API accepts image content inside
|
||||
# tool-result messages natively. Set on providers that expose
|
||||
# multimodal models via tool results (Anthropic Messages API,
|
||||
# OpenAI Chat Completions, Gemini, Xiaomi, MiniMax, etc.).
|
||||
# OpenAI Chat Completions, Gemini, MiniMax, etc.).
|
||||
# Falls back to model-catalog lookup when False and the provider
|
||||
# has no registered profile.
|
||||
supports_vision: bool = False
|
||||
|
||||
# True when the provider's API accepts list-type tool message
|
||||
# content (multipart with image_url parts). Defaults to True for
|
||||
# backward compatibility. Set to False for providers that accept
|
||||
# multimodal user messages but reject list-type tool content
|
||||
# (e.g. Xiaomi MiMo, which returns 400 "text is not set").
|
||||
supports_vision_tool_messages: bool = True
|
||||
|
||||
# ── Model catalog ─────────────────────────────────────────
|
||||
# fallback_models: curated list shown in /model picker when live fetch fails.
|
||||
# Only agentic models that support tool calling should appear here.
|
||||
|
|
|
|||
35
run_agent.py
35
run_agent.py
|
|
@ -4255,6 +4255,23 @@ class AIAgent:
|
|||
except Exception:
|
||||
return False
|
||||
|
||||
def _provider_supports_vision_tool_messages(self) -> bool:
    """Return True if the active provider accepts list-type tool content.

    Some providers (e.g. Xiaomi MiMo) support multimodal user messages
    but reject list-type tool message content with 400 errors. This
    checks the provider profile's ``supports_vision_tool_messages`` field.

    The provider name is stripped and lower-cased before the lookup, the
    same normalisation this class applies to the provider component of
    its in-session downgrade cache key, so ``"Xiaomi"`` and ``"xiaomi"``
    resolve to the same profile.

    Returns:
        ``False`` only when a profile is found and its
        ``supports_vision_tool_messages`` value is falsy. ``True`` in
        every other case: no profile, a profile without the field, or
        any error raised while importing or querying the registry.
    """
    try:
        from providers import get_provider_profile

        provider = (getattr(self, "provider", "") or "").strip().lower()
        profile = get_provider_profile(provider)
        if profile is not None:
            # Coerce so callers always get a real bool, as annotated.
            return bool(getattr(profile, "supports_vision_tool_messages", True))
    except Exception:
        # Deliberate best-effort probe: a broken or missing registry must
        # not break tool-result handling, so fall through to the
        # permissive default below.
        pass
    return True  # default: assume compatible
|
||||
|
||||
def _preprocess_anthropic_content(self, content: Any, role: str) -> Any:
|
||||
if not self._content_has_image_parts(content):
|
||||
return content
|
||||
|
|
@ -4394,13 +4411,17 @@ class AIAgent:
|
|||
return content
|
||||
|
||||
if self._model_supports_vision():
|
||||
# Vision-capable on paper — but if we've already learned in this
|
||||
# session that the active (provider, model) rejects list-type
|
||||
# tool content (e.g. Xiaomi MiMo's 400 "text is not set"),
|
||||
# short-circuit to a text summary so we don't burn another
|
||||
# round-trip relearning the same lesson. Cache populated by
|
||||
# the 400 recovery path in agent.conversation_loop. Transient
|
||||
# per-session; next session retries.
|
||||
# Vision-capable on paper — but if the provider rejects list-type
|
||||
# tool content (e.g. Xiaomi MiMo's 400 "text is not set"), or if
|
||||
# we've already learned this lesson in-session, short-circuit to
|
||||
# a text summary so we don't burn a round-trip relearning it.
|
||||
if not self._provider_supports_vision_tool_messages():
|
||||
logger.debug(
|
||||
"Tool %s: provider %s does not accept list-type tool "
|
||||
"content — sending text summary",
|
||||
tool_name, getattr(self, "provider", ""),
|
||||
)
|
||||
return _multimodal_text_summary(result)
|
||||
key = (
|
||||
(getattr(self, "provider", "") or "").strip().lower(),
|
||||
(getattr(self, "model", "") or "").strip(),
|
||||
|
|
|
|||
|
|
@ -181,16 +181,20 @@ class TestToolResultContentShortCircuit:
|
|||
"png_bytes": 1024},
|
||||
}
|
||||
|
||||
def test_returns_list_when_cache_empty_and_vision_supported(self, monkeypatch):
|
||||
def test_returns_text_summary_for_xiaomi_proactively(self, monkeypatch):
|
||||
"""Xiaomi MiMo rejects list-type tool content, so even with an
|
||||
empty cache, _tool_result_content_for_active_model should
|
||||
proactively downgrade to a text summary."""
|
||||
agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
|
||||
agent._no_list_tool_content_models = set() # explicit empty
|
||||
monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
|
||||
out = agent._tool_result_content_for_active_model(
|
||||
"computer_use", self._multimodal_result()
|
||||
)
|
||||
# Native multimodal path: returns the content parts list.
|
||||
assert isinstance(out, list)
|
||||
assert any(p.get("type") == "image_url" for p in out)
|
||||
# Proactive downgrade: text summary instead of list with images.
|
||||
assert isinstance(out, str)
|
||||
assert "data:image" not in out
|
||||
assert "image_url" not in out
|
||||
|
||||
def test_returns_text_summary_when_model_in_cache(self, monkeypatch):
|
||||
agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
|
||||
|
|
@ -204,29 +208,31 @@ class TestToolResultContentShortCircuit:
|
|||
assert "data:image" not in out
|
||||
assert "image_url" not in out
|
||||
|
||||
def test_cache_miss_on_different_model(self, monkeypatch):
|
||||
"""Cache is per (provider, model). A cached entry for mimo-v2.5
|
||||
must NOT affect a session running on a different model.
|
||||
"""
|
||||
def test_xiaomi_any_model_gets_text_summary(self, monkeypatch):
|
||||
"""All Xiaomi models reject list-type tool content, so even a
|
||||
different model on the same provider gets a text summary."""
|
||||
agent = _make_agent(provider="xiaomi", model="mimo-v2.5-pro")
|
||||
agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")}
|
||||
monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
|
||||
out = agent._tool_result_content_for_active_model(
|
||||
"computer_use", self._multimodal_result()
|
||||
)
|
||||
assert isinstance(out, list)
|
||||
assert isinstance(out, str)
|
||||
assert "data:image" not in out
|
||||
|
||||
def test_missing_cache_attribute_falls_through(self, monkeypatch):
|
||||
"""Tests that build agents via ``object.__new__`` without calling
|
||||
``__init__`` must not crash — the cache attribute may be absent.
|
||||
"""
|
||||
agent = _make_agent()
|
||||
"""Agents built via ``object.__new__`` without calling ``__init__``
|
||||
must not crash — the cache attribute may be absent. Xiaomi still
|
||||
gets a text summary because the provider profile says so."""
|
||||
agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
|
||||
# Deliberately do not assign _no_list_tool_content_models.
|
||||
monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
|
||||
out = agent._tool_result_content_for_active_model(
|
||||
"computer_use", self._multimodal_result()
|
||||
)
|
||||
assert isinstance(out, list)
|
||||
# Xiaomi proactively downgrades regardless of cache state.
|
||||
assert isinstance(out, str)
|
||||
assert "data:image" not in out
|
||||
|
||||
|
||||
# ─── Classifier ──────────────────────────────────────────────────────────────
|
||||
|
|
|
|||
212
tests/run_agent/test_vision_tool_messages.py
Normal file
212
tests/run_agent/test_vision_tool_messages.py
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
"""Tests for proactive vision-tool-message downgrade (issue #41072).
|
||||
|
||||
When a provider supports vision in user messages but rejects list-type
|
||||
tool message content (e.g. Xiaomi MiMo's 400 "text is not set"),
|
||||
``_tool_result_content_for_active_model`` should proactively downgrade
|
||||
to a text summary instead of waiting for a reactive 400 recovery.
|
||||
|
||||
The fix adds ``supports_vision_tool_messages`` to ``ProviderProfile``
|
||||
and checks it in ``_tool_result_content_for_active_model``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_agent(provider="openrouter", model="gpt-4o"):
    """Build a ``MagicMock`` stand-in for ``AIAgent`` that runs the real logic.

    The mock is spec'd against ``AIAgent`` and given ``provider``, ``model``
    and an empty ``_no_list_tool_content_models`` set. The three methods
    under test are rebound so that calling them on the mock delegates to
    the unbound ``AIAgent`` implementations with the mock as ``self``;
    ``_content_has_image_parts`` is replaced by a local re-implementation.
    """
    from run_agent import AIAgent

    image_part_types = {"image_url", "input_image"}

    def _has_image_parts(content):
        # Only a list can carry multimodal parts; within it, any dict part
        # whose "type" is an image type counts.
        return isinstance(content, list) and any(
            isinstance(part, dict) and part.get("type") in image_part_types
            for part in content
        )

    agent = MagicMock(spec=AIAgent)
    agent.provider, agent.model = provider, model
    agent._no_list_tool_content_models = set()
    agent._content_has_image_parts = _has_image_parts

    # Route calls on the mock through the real, unbound AIAgent methods.
    agent._model_supports_vision = lambda: AIAgent._model_supports_vision(agent)
    agent._provider_supports_vision_tool_messages = (
        lambda: AIAgent._provider_supports_vision_tool_messages(agent)
    )
    agent._tool_result_content_for_active_model = (
        lambda name, result: AIAgent._tool_result_content_for_active_model(
            agent, name, result
        )
    )
    return agent
|
||||
|
||||
|
||||
def _multimodal_result(text="screenshot", image_url="data:image/png;base64,AAAA"):
    """Return a multimodal tool-result dict with one text and one image part.

    ``text`` is used both for the text part and for ``text_summary``;
    ``image_url`` becomes the URL of the single ``image_url`` part.
    """
    text_part = {"type": "text", "text": text}
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    return dict(
        _multimodal=True,
        content=[text_part, image_part],
        text_summary=text,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _provider_supports_vision_tool_messages
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestProviderSupportsVisionToolMessages:
    """``_provider_supports_vision_tool_messages`` per provider name."""

    def test_xiaomi_returns_false(self):
        """The xiaomi provider reports no list-type tool content support."""
        xiaomi_agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
        supported = xiaomi_agent._provider_supports_vision_tool_messages()
        assert supported is False

    def test_xiaomi_alias_mimo_returns_false(self):
        """The ``mimo`` provider name gives the same answer as ``xiaomi``."""
        alias_agent = _make_agent(provider="mimo", model="mimo-v2.5")
        supported = alias_agent._provider_supports_vision_tool_messages()
        assert supported is False

    def test_unknown_provider_defaults_true(self):
        """A provider name with no known profile is assumed compatible."""
        unknown_agent = _make_agent(provider="some-unknown-provider", model="model-v1")
        supported = unknown_agent._provider_supports_vision_tool_messages()
        assert supported is True

    def test_openrouter_defaults_true(self):
        """OpenRouter is reported as compatible."""
        openrouter_agent = _make_agent(provider="openrouter", model="gpt-4o")
        supported = openrouter_agent._provider_supports_vision_tool_messages()
        assert supported is True

    def test_anthropic_defaults_true(self):
        """Anthropic is reported as compatible."""
        anthropic_agent = _make_agent(provider="anthropic", model="claude-sonnet-4")
        supported = anthropic_agent._provider_supports_vision_tool_messages()
        assert supported is True

    def test_empty_provider_defaults_true(self):
        """Empty provider and model strings fall back to the default."""
        blank_agent = _make_agent(provider="", model="")
        supported = blank_agent._provider_supports_vision_tool_messages()
        assert supported is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _tool_result_content_for_active_model — proactive downgrade
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestToolResultContentProactiveDowngrade:
    """``_tool_result_content_for_active_model`` downgrade decisions."""

    @staticmethod
    def _content_under_vision(agent, tool_name, result, vision):
        """Run the method under test with ``_model_supports_vision`` pinned."""
        with patch.object(agent, "_model_supports_vision", return_value=vision):
            return agent._tool_result_content_for_active_model(tool_name, result)

    def test_xiaomi_downgrades_to_text_summary(self):
        """Xiaomi: vision=True but supports_vision_tool_messages=False → text."""
        xiaomi_agent = _make_agent("xiaomi", "mimo-v2.5")
        payload = _multimodal_result(text="screenshot captured")

        content = self._content_under_vision(
            xiaomi_agent, "browser_screenshot", payload, True
        )

        assert isinstance(content, str)
        assert "screenshot captured" in content

    def test_xiaomi_non_multimodal_passes_through(self):
        """Non-multimodal results should pass through unchanged."""
        xiaomi_agent = _make_agent("xiaomi", "mimo-v2.5")
        payload = "plain text result"

        # No vision patch here: the real ``_model_supports_vision`` stays wired.
        content = xiaomi_agent._tool_result_content_for_active_model("some_tool", payload)

        assert content == "plain text result"

    def test_openrouter_vision_keeps_list_content(self):
        """OpenRouter with vision: list content preserved."""
        openrouter_agent = _make_agent("openrouter", "gpt-4o")
        payload = _multimodal_result()

        content = self._content_under_vision(
            openrouter_agent, "browser_screenshot", payload, True
        )

        assert isinstance(content, list)
        image_parts = [
            part
            for part in content
            if isinstance(part, dict) and part.get("type") == "image_url"
        ]
        assert image_parts

    def test_non_vision_model_gets_text_summary(self):
        """Non-vision model: text summary regardless of provider."""
        openrouter_agent = _make_agent("openrouter", "gpt-3.5-turbo")
        payload = _multimodal_result(text="screenshot")

        content = self._content_under_vision(
            openrouter_agent, "browser_screenshot", payload, False
        )

        assert isinstance(content, str)
        assert "screenshot" in content

    def test_xiaomi_computer_use_gets_text_summary(self):
        """Xiaomi + computer_use: text summary (not the error dict)."""
        xiaomi_agent = _make_agent("xiaomi", "mimo-v2.5")
        payload = _multimodal_result(text="desktop screenshot")

        content = self._content_under_vision(
            xiaomi_agent, "computer_use", payload, True
        )

        # Should be a text summary, not the error dict for non-vision models
        assert isinstance(content, str)
        assert "desktop screenshot" in content

    def test_xiaomi_no_image_parts_returns_content(self):
        """Xiaomi tool result with no image parts: returns content list."""
        xiaomi_agent = _make_agent("xiaomi", "mimo-v2.5")
        payload = {
            "_multimodal": True,
            "content": [{"type": "text", "text": "just text"}],
        }

        content = self._content_under_vision(xiaomi_agent, "some_tool", payload, True)

        # No image parts → returns content as-is
        assert isinstance(content, list)

    def test_reactive_cache_still_works(self):
        """In-session cache (_no_list_tool_content_models) still triggers."""
        openrouter_agent = _make_agent("openrouter", "some-model")
        openrouter_agent._no_list_tool_content_models = {("openrouter", "some-model")}
        payload = _multimodal_result(text="cached downgrade")

        content = self._content_under_vision(
            openrouter_agent, "browser_screenshot", payload, True
        )

        assert isinstance(content, str)
        assert "cached downgrade" in content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ProviderProfile.supports_vision_tool_messages field
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestProviderProfileField:
    """``ProviderProfile.supports_vision_tool_messages`` default and overrides."""

    def test_default_is_true(self):
        """The field must exist on ``ProviderProfile`` and default to True.

        Presence is asserted explicitly: looking the field up with a
        ``True`` fallback would let this test pass even if the field were
        removed or renamed.
        """
        import dataclasses

        from providers.base import ProviderProfile

        field_name = "supports_vision_tool_messages"
        if dataclasses.is_dataclass(ProviderProfile):
            defaults = {f.name: f.default for f in dataclasses.fields(ProviderProfile)}
            assert field_name in defaults, f"ProviderProfile has no {field_name} field"
            assert defaults[field_name] is True
        else:
            # Not a dataclass: fall back to the class-level attribute default.
            assert hasattr(ProviderProfile, field_name), (
                f"ProviderProfile has no {field_name} attribute"
            )
            assert getattr(ProviderProfile, field_name) is True

    def test_xiaomi_profile_has_false(self):
        """The registered xiaomi profile opts out of list-type tool content."""
        from providers import get_provider_profile

        profile = get_provider_profile("xiaomi")
        assert profile is not None
        assert profile.supports_vision_tool_messages is False

    def test_xiaomi_alias_mimo_has_false(self):
        """Looking the profile up as ``mimo`` yields the same opt-out."""
        from providers import get_provider_profile

        profile = get_provider_profile("mimo")
        assert profile is not None
        assert profile.supports_vision_tool_messages is False

    def test_anthropic_profile_defaults_true(self):
        """An anthropic profile, when registered, keeps the True default."""
        from providers import get_provider_profile

        profile = get_provider_profile("anthropic")
        if profile is None:
            # Report the missing profile as a skip rather than a silent pass.
            pytest.skip("anthropic provider profile is not registered")
        assert profile.supports_vision_tool_messages is True
|
||||
Loading…
Add table
Add a link
Reference in a new issue