mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-31 06:51:29 +00:00
Some providers (Xiaomi MiMo, some Alibaba endpoints, a long tail of OpenAI-compatible servers) follow the OpenAI spec strictly and require tool message `content` to be a string — they reject our list-type content (text + image_url parts) with HTTP 400 'text is not set' / 'tool message content must be a string'. Instead of an allowlist of known-good providers (maintenance burden, guaranteed to miss aggregators like OpenRouter where the underlying model determines support, not the aggregator name), this lands a reactive recovery: 1. New `FailoverReason.multimodal_tool_content_unsupported` with a small pattern list covering the common 400 wordings. 2. `AIAgent._try_strip_image_parts_from_tool_messages` walks the API message list, downgrades any `role:tool` message whose content is list-with-image to a plain text summary (preserves text parts) in place, AND records the active (provider, model) in a session-scoped `_no_list_tool_content_models` set. 3. `_tool_result_content_for_active_model` short-circuits to a text summary when (provider, model) is in the cache — so after the first 400 + retry, subsequent screenshots in the same session skip the round trip entirely. 4. Retry hook in `agent.conversation_loop` mirrors the existing `image_too_large` recovery: detect the reason, run the helper, retry once, fall through to the normal error path if no list-type tool content was actually present. Cache is transient (per-session) by design — next session retries in case the provider added support, no persistent state to maintain. Fixes #27344. Closes #27351 (allowlist approach superseded by reactive recovery).
260 lines
11 KiB
Python
260 lines
11 KiB
Python
"""Tests for reactive multimodal-tool-content recovery.
|
|
|
|
Covers the full chain for providers that reject list-type content in
|
|
``role: "tool"`` messages (Xiaomi MiMo's 400 "text is not set", etc.):
|
|
|
|
1. agent/error_classifier.py: 400 with the right wording classifies as
|
|
``FailoverReason.multimodal_tool_content_unsupported``.
|
|
2. run_agent._try_strip_image_parts_from_tool_messages downgrades tool
|
|
messages whose ``content`` is a list-with-image to a string text
|
|
summary, in-place, and records the active (provider, model) in
|
|
``self._no_list_tool_content_models`` so future tool results in this
|
|
session preemptively downgrade.
|
|
3. run_agent._tool_result_content_for_active_model short-circuits to a
|
|
text summary when the (provider, model) is in the cache, even though
|
|
``_model_supports_vision`` returns True — avoiding a wasted round
|
|
trip on every subsequent screenshot in the session.
|
|
|
|
The end-to-end retry loop wiring (`conversation_loop.py`) is exercised by
|
|
the classifier signal + helper-mutation tests; the integration only adds
|
|
a trivial flag-and-continue around the existing pattern used for
|
|
``image_too_large`` recovery.
|
|
|
|
See: https://github.com/NousResearch/hermes-agent/issues/27344
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from agent.error_classifier import FailoverReason, classify_api_error
|
|
|
|
|
|
class _FakeApiError(Exception):
|
|
"""Stand-in for an openai.BadRequestError with status_code + body."""
|
|
|
|
def __init__(self, status_code: int, message: str, body: dict | None = None):
|
|
super().__init__(message)
|
|
self.status_code = status_code
|
|
self.body = body or {"error": {"message": message}}
|
|
self.response = None
|
|
|
|
|
|
def _make_agent(provider: str = "xiaomi", model: str = "mimo-v2.5"):
|
|
"""Build a bare AIAgent for method-level testing, no provider setup."""
|
|
from run_agent import AIAgent
|
|
agent = object.__new__(AIAgent)
|
|
agent.provider = provider
|
|
agent.model = model
|
|
return agent
|
|
|
|
|
|
# ─── Strip helper ────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestStripImagePartsHelper:
|
|
def test_no_messages_returns_false(self):
|
|
agent = _make_agent()
|
|
assert agent._try_strip_image_parts_from_tool_messages([]) is False
|
|
assert agent._try_strip_image_parts_from_tool_messages(None) is False
|
|
|
|
def test_no_tool_messages_returns_false(self):
|
|
agent = _make_agent()
|
|
msgs = [
|
|
{"role": "user", "content": "plain text"},
|
|
{"role": "assistant", "content": "ack"},
|
|
]
|
|
assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
|
|
|
|
def test_tool_message_with_string_content_unchanged(self):
|
|
agent = _make_agent()
|
|
msgs = [
|
|
{"role": "tool", "tool_call_id": "x", "content": "plain string result"},
|
|
]
|
|
assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
|
|
assert msgs[0]["content"] == "plain string result"
|
|
|
|
def test_tool_message_list_without_image_unchanged(self):
|
|
"""List content with only text parts is left alone — caller surfaces
|
|
the original error if this turns out to also be rejected."""
|
|
agent = _make_agent()
|
|
msgs = [
|
|
{"role": "tool", "tool_call_id": "x", "content": [
|
|
{"type": "text", "text": "hello"},
|
|
]},
|
|
]
|
|
assert agent._try_strip_image_parts_from_tool_messages(msgs) is False
|
|
|
|
def test_tool_message_list_with_image_downgrades(self):
|
|
agent = _make_agent()
|
|
msgs = [
|
|
{"role": "tool", "tool_call_id": "x", "content": [
|
|
{"type": "text", "text": "AX summary: 5 buttons visible"},
|
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}},
|
|
]},
|
|
]
|
|
assert agent._try_strip_image_parts_from_tool_messages(msgs) is True
|
|
# Image stripped; text preserved as a string.
|
|
assert isinstance(msgs[0]["content"], str)
|
|
assert "AX summary" in msgs[0]["content"]
|
|
assert "image_url" not in msgs[0]["content"]
|
|
assert "iVBOR" not in msgs[0]["content"]
|
|
|
|
def test_tool_message_image_only_gets_placeholder(self):
|
|
"""If the list had nothing but image parts, leave a placeholder so
|
|
the assistant message has something to reference."""
|
|
agent = _make_agent()
|
|
msgs = [
|
|
{"role": "tool", "tool_call_id": "x", "content": [
|
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR..."}},
|
|
]},
|
|
]
|
|
assert agent._try_strip_image_parts_from_tool_messages(msgs) is True
|
|
assert isinstance(msgs[0]["content"], str)
|
|
assert "image content removed" in msgs[0]["content"]
|
|
|
|
def test_records_provider_model_in_session_cache(self):
|
|
agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
|
|
msgs = [
|
|
{"role": "tool", "tool_call_id": "x", "content": [
|
|
{"type": "text", "text": "summary"},
|
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
|
|
]},
|
|
]
|
|
agent._try_strip_image_parts_from_tool_messages(msgs)
|
|
assert ("xiaomi", "mimo-v2.5") in agent._no_list_tool_content_models
|
|
|
|
def test_only_tool_messages_get_downgraded(self):
|
|
"""User / assistant messages with list-type content are out of
|
|
scope — they're handled by the existing image-routing path."""
|
|
agent = _make_agent()
|
|
msgs = [
|
|
{"role": "user", "content": [
|
|
{"type": "text", "text": "describe"},
|
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
|
|
]},
|
|
{"role": "tool", "tool_call_id": "x", "content": [
|
|
{"type": "text", "text": "summary"},
|
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,Y"}},
|
|
]},
|
|
]
|
|
agent._try_strip_image_parts_from_tool_messages(msgs)
|
|
# User message untouched.
|
|
assert isinstance(msgs[0]["content"], list)
|
|
assert any(p.get("type") == "image_url" for p in msgs[0]["content"])
|
|
# Tool message downgraded.
|
|
assert isinstance(msgs[1]["content"], str)
|
|
assert "summary" in msgs[1]["content"]
|
|
|
|
def test_skips_recording_when_no_model_id(self):
|
|
"""Don't poison the cache with empty keys when provider/model is
|
|
unset (e.g. lazy-initialised mid-handshake)."""
|
|
agent = _make_agent(provider="", model="")
|
|
msgs = [
|
|
{"role": "tool", "tool_call_id": "x", "content": [
|
|
{"type": "text", "text": "summary"},
|
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,X"}},
|
|
]},
|
|
]
|
|
agent._try_strip_image_parts_from_tool_messages(msgs)
|
|
assert agent._no_list_tool_content_models == set()
|
|
|
|
|
|
# ─── Short-circuit on cached models ──────────────────────────────────────────
|
|
|
|
|
|
class TestToolResultContentShortCircuit:
|
|
"""Once the session has learned that (provider, model) rejects list
|
|
content, ``_tool_result_content_for_active_model`` returns a text
|
|
summary even though ``_model_supports_vision`` reports True.
|
|
"""
|
|
|
|
def _multimodal_result(self, png_b64: str = "iVBORw0KGgoAAAA"):
|
|
return {
|
|
"_multimodal": True,
|
|
"content": [
|
|
{"type": "text", "text": "capture mode=som 800x600 app=Safari"},
|
|
{"type": "image_url",
|
|
"image_url": {"url": f"data:image/png;base64,{png_b64}"}},
|
|
],
|
|
"text_summary": "capture mode=som 800x600 app=Safari",
|
|
"meta": {"mode": "som", "width": 800, "height": 600, "elements": 5,
|
|
"png_bytes": 1024},
|
|
}
|
|
|
|
def test_returns_list_when_cache_empty_and_vision_supported(self, monkeypatch):
|
|
agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
|
|
agent._no_list_tool_content_models = set() # explicit empty
|
|
monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
|
|
out = agent._tool_result_content_for_active_model(
|
|
"computer_use", self._multimodal_result()
|
|
)
|
|
# Native multimodal path: returns the content parts list.
|
|
assert isinstance(out, list)
|
|
assert any(p.get("type") == "image_url" for p in out)
|
|
|
|
def test_returns_text_summary_when_model_in_cache(self, monkeypatch):
|
|
agent = _make_agent(provider="xiaomi", model="mimo-v2.5")
|
|
agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")}
|
|
monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
|
|
out = agent._tool_result_content_for_active_model(
|
|
"computer_use", self._multimodal_result()
|
|
)
|
|
# Short-circuit: a plain string summary, no image_url present.
|
|
assert isinstance(out, str)
|
|
assert "data:image" not in out
|
|
assert "image_url" not in out
|
|
|
|
def test_cache_miss_on_different_model(self, monkeypatch):
|
|
"""Cache is per (provider, model). A cached entry for mimo-v2.5
|
|
must NOT affect a session running on a different model.
|
|
"""
|
|
agent = _make_agent(provider="xiaomi", model="mimo-v2.5-pro")
|
|
agent._no_list_tool_content_models = {("xiaomi", "mimo-v2.5")}
|
|
monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
|
|
out = agent._tool_result_content_for_active_model(
|
|
"computer_use", self._multimodal_result()
|
|
)
|
|
assert isinstance(out, list)
|
|
|
|
def test_missing_cache_attribute_falls_through(self, monkeypatch):
|
|
"""Tests that build agents via ``object.__new__`` without calling
|
|
``__init__`` must not crash — the cache attribute may be absent.
|
|
"""
|
|
agent = _make_agent()
|
|
# Deliberately do not assign _no_list_tool_content_models.
|
|
monkeypatch.setattr(agent, "_model_supports_vision", lambda: True)
|
|
out = agent._tool_result_content_for_active_model(
|
|
"computer_use", self._multimodal_result()
|
|
)
|
|
assert isinstance(out, list)
|
|
|
|
|
|
# ─── Classifier ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestRecoveryEndToEndClassification:
|
|
"""Lock in that the patterns used by the recovery path classify to
|
|
the right ``FailoverReason``. (The recovery hook in
|
|
``agent.conversation_loop`` consumes this reason directly.)
|
|
"""
|
|
|
|
def test_xiaomi_mimo_classifies(self):
|
|
err = _FakeApiError(
|
|
status_code=400,
|
|
message=(
|
|
"Error code: 400 - {'error': {'code': '400', 'message': "
|
|
"'Param Incorrect', 'param': 'text is not set', 'type': ''}}"
|
|
),
|
|
)
|
|
result = classify_api_error(err, provider="xiaomi", model="mimo-v2.5")
|
|
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|
|
assert result.retryable is True
|
|
|
|
def test_alibaba_variant_classifies(self):
|
|
err = _FakeApiError(
|
|
status_code=400,
|
|
message="tool_call.content must be string",
|
|
)
|
|
result = classify_api_error(err, provider="alibaba", model="qwen3.5-plus")
|
|
assert result.reason == FailoverReason.multimodal_tool_content_unsupported
|