fix(models): prefer image modalities for vision routing

2026-06-22 10:32:00 +00:00 · 2026-05-03 19:32:26 +08:00 · 2026-05-03 19:32:26 +08:00 · 14f38822fa
commit 14f38822fa
parent 6e46f99e7e
3 changed files with 39 additions and 6 deletions
--- a/agent/models_dev.py
+++ b/agent/models_dev.py
@ -381,14 +381,18 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit

    # Extract capability flags (default to False if missing)
    supports_tools = bool(entry.get("tool_call", False))
-    # Vision: check both the `attachment` flag and `modalities.input` for "image".
-    # Some models (e.g. gemma-4) list image in input modalities but not attachment.
+    # Vision: prefer explicit `modalities.input` when models.dev provides it.
+    # The older `attachment` flag can be stale or too broad for image routing;
+    # fall back to it only when `modalities.input` is missing or not a list.
    input_mods = entry.get("modalities", {})
    if isinstance(input_mods, dict):
-        input_mods = input_mods.get("input", [])
+        input_mods = input_mods.get("input")
    else:
-        input_mods = []
-    supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
+        input_mods = None
+    if isinstance(input_mods, list):
+        supports_vision = "image" in input_mods
+    else:
+        supports_vision = bool(entry.get("attachment", False))
    supports_reasoning = bool(entry.get("reasoning", False))

    # Extract limits
--- a/tests/agent/test_image_routing.py
+++ b/tests/agent/test_image_routing.py
@ -109,6 +109,21 @@ class TestDecideImageInputMode:
        with patch("agent.image_routing._lookup_supports_vision", return_value=True):
            assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native"

+    def test_auto_uses_text_for_text_only_modalities_even_with_attachment_flag(self):
+        registry = {
+            "xiaomi": {
+                "models": {
+                    "mimo-v2.5-pro": {
+                        "attachment": True,
+                        "modalities": {"input": ["text"]},
+                        "tool_call": True,
+                    },
+                },
+            },
+        }
+        with patch("agent.models_dev.fetch_models_dev", return_value=registry):
+            assert decide_image_input_mode("xiaomi", "mimo-v2.5-pro", {}) == "text"
+

 # ─── build_native_content_parts ──────────────────────────────────────────────

--- a/tests/agent/test_models_dev.py
+++ b/tests/agent/test_models_dev.py
@ -223,6 +223,13 @@ CAPS_REGISTRY = {
                "tool_call": True,
                "limit": {"context": 32000, "output": 8192},
            },
+            "text-only-with-stale-attachment": {
+                "id": "text-only-with-stale-attachment",
+                "attachment": True,
+                "tool_call": True,
+                "modalities": {"input": ["text"]},
+                "limit": {"context": 128000, "output": 8192},
+            },
        },
    },
    "anthropic": {
@ -243,7 +250,7 @@ class TestGetModelCapabilities:
    """Tests for get_model_capabilities vision detection."""

    def test_vision_from_attachment_flag(self):
-        """Models with attachment=True should report supports_vision=True."""
+        """Models with attachment=True and no modalities should report supports_vision=True."""
        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
            caps = get_model_capabilities("anthropic", "claude-sonnet-4")
        assert caps is not None
@ -257,6 +264,13 @@ class TestGetModelCapabilities:
        assert caps is not None
        assert caps.supports_vision is True

+    def test_text_only_modalities_override_stale_attachment_flag(self):
+        """Text-only modalities must win over stale attachment=True metadata."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("google", "text-only-with-stale-attachment")
+        assert caps is not None
+        assert caps.supports_vision is False
+
    def test_no_vision_without_attachment_or_modalities(self):
        """Models with neither attachment nor image modality should be non-vision."""
        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):