From 14f38822fa56a740899afa1d0b1f2df8c90cb422 Mon Sep 17 00:00:00 2001 From: LeonSGP43 Date: Sun, 3 May 2026 19:32:26 +0800 Subject: [PATCH] fix(models): prefer image modalities for vision routing --- agent/models_dev.py | 14 +++++++++----- tests/agent/test_image_routing.py | 15 +++++++++++++++ tests/agent/test_models_dev.py | 16 +++++++++++++++- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/agent/models_dev.py b/agent/models_dev.py index 79cfa90ca9..0ef18f4ce1 100644 --- a/agent/models_dev.py +++ b/agent/models_dev.py @@ -381,14 +381,18 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit # Extract capability flags (default to False if missing) supports_tools = bool(entry.get("tool_call", False)) - # Vision: check both the `attachment` flag and `modalities.input` for "image". - # Some models (e.g. gemma-4) list image in input modalities but not attachment. + # Vision: prefer explicit `modalities.input` when models.dev provides it. + # The older `attachment` flag can be stale or too broad for image routing; + # fall back to it only when the input modalities are absent/invalid. input_mods = entry.get("modalities", {}) if isinstance(input_mods, dict): - input_mods = input_mods.get("input", []) + input_mods = input_mods.get("input") else: - input_mods = [] - supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods + input_mods = None + if isinstance(input_mods, list): + supports_vision = "image" in input_mods + else: + supports_vision = bool(entry.get("attachment", False)) supports_reasoning = bool(entry.get("reasoning", False)) # Extract limits diff --git a/tests/agent/test_image_routing.py b/tests/agent/test_image_routing.py index aef7bbda65..89b19a6d42 100644 --- a/tests/agent/test_image_routing.py +++ b/tests/agent/test_image_routing.py @@ -109,6 +109,21 @@ class TestDecideImageInputMode: with patch("agent.image_routing._lookup_supports_vision", return_value=True): assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native" + def test_auto_uses_text_for_text_only_modalities_even_with_attachment_flag(self): + registry = { + "xiaomi": { + "models": { + "mimo-v2.5-pro": { + "attachment": True, + "modalities": {"input": ["text"]}, + "tool_call": True, + }, + }, + }, + } + with patch("agent.models_dev.fetch_models_dev", return_value=registry): + assert decide_image_input_mode("xiaomi", "mimo-v2.5-pro", {}) == "text" + # ─── build_native_content_parts ────────────────────────────────────────────── diff --git a/tests/agent/test_models_dev.py b/tests/agent/test_models_dev.py index c2a2140186..4eac2bd561 100644 --- a/tests/agent/test_models_dev.py +++ b/tests/agent/test_models_dev.py @@ -223,6 +223,13 @@ CAPS_REGISTRY = { "tool_call": True, "limit": {"context": 32000, "output": 8192}, }, + "text-only-with-stale-attachment": { + "id": "text-only-with-stale-attachment", + "attachment": True, + "tool_call": True, + "modalities": {"input": ["text"]}, + "limit": {"context": 128000, "output": 8192}, + }, }, }, "anthropic": { @@ -243,7 +250,7 @@ class TestGetModelCapabilities: """Tests for get_model_capabilities vision detection.""" def test_vision_from_attachment_flag(self): - """Models with attachment=True should report supports_vision=True.""" + """Models with attachment=True and no modalities should report supports_vision=True.""" with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY): caps = get_model_capabilities("anthropic", "claude-sonnet-4") assert caps is not None @@ -257,6 +264,13 @@ class TestGetModelCapabilities: assert caps is not None assert caps.supports_vision is True + def test_text_only_modalities_override_stale_attachment_flag(self): + """Text-only modalities must win over stale attachment=True metadata.""" + with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY): + caps = get_model_capabilities("google", "text-only-with-stale-attachment") + assert caps is not None + assert caps.supports_vision is False + def test_no_vision_without_attachment_or_modalities(self): """Models with neither attachment nor image modality should be non-vision.""" with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):