From 14f38822fa56a740899afa1d0b1f2df8c90cb422 Mon Sep 17 00:00:00 2001
From: LeonSGP43 <cine.dreamer.one@gmail.com>
Date: Sun, 3 May 2026 19:32:26 +0800
Subject: [PATCH] fix(models): prefer image modalities for vision routing

---
 agent/models_dev.py               | 14 +++++++++-----
 tests/agent/test_image_routing.py | 15 +++++++++++++++
 tests/agent/test_models_dev.py    | 16 +++++++++++++++-
 3 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/agent/models_dev.py b/agent/models_dev.py
index 79cfa90ca9..0ef18f4ce1 100644
--- a/agent/models_dev.py
+++ b/agent/models_dev.py
@@ -381,14 +381,18 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit
 
     # Extract capability flags (default to False if missing)
     supports_tools = bool(entry.get("tool_call", False))
-    # Vision: check both the `attachment` flag and `modalities.input` for "image".
-    # Some models (e.g. gemma-4) list image in input modalities but not attachment.
+    # Vision: prefer explicit `modalities.input` when models.dev provides it.
+    # The older `attachment` flag can be stale or too broad for image routing;
+    # fall back to it only when `modalities.input` is missing or not a list.
     input_mods = entry.get("modalities", {})
     if isinstance(input_mods, dict):
-        input_mods = input_mods.get("input", [])
+        input_mods = input_mods.get("input")
     else:
-        input_mods = []
-    supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
+        input_mods = None
+    if isinstance(input_mods, list):
+        supports_vision = "image" in input_mods
+    else:
+        supports_vision = bool(entry.get("attachment", False))
     supports_reasoning = bool(entry.get("reasoning", False))
 
     # Extract limits
diff --git a/tests/agent/test_image_routing.py b/tests/agent/test_image_routing.py
index aef7bbda65..89b19a6d42 100644
--- a/tests/agent/test_image_routing.py
+++ b/tests/agent/test_image_routing.py
@@ -109,6 +109,21 @@ class TestDecideImageInputMode:
         with patch("agent.image_routing._lookup_supports_vision", return_value=True):
             assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native"
 
+    def test_auto_uses_text_for_text_only_modalities_even_with_attachment_flag(self):
+        registry = {
+            "xiaomi": {
+                "models": {
+                    "mimo-v2.5-pro": {
+                        "attachment": True,  # set on purpose: must not force native images
+                        "modalities": {"input": ["text"]},  # explicit text-only input list
+                        "tool_call": True,
+                    },
+                },
+            },
+        }
+        with patch("agent.models_dev.fetch_models_dev", return_value=registry):  # stub the fetch
+            assert decide_image_input_mode("xiaomi", "mimo-v2.5-pro", {}) == "text"
+
 
 # ─── build_native_content_parts ──────────────────────────────────────────────
 
diff --git a/tests/agent/test_models_dev.py b/tests/agent/test_models_dev.py
index c2a2140186..4eac2bd561 100644
--- a/tests/agent/test_models_dev.py
+++ b/tests/agent/test_models_dev.py
@@ -223,6 +223,13 @@ CAPS_REGISTRY = {
                 "tool_call": True,
                 "limit": {"context": 32000, "output": 8192},
             },
+            "text-only-with-stale-attachment": {
+                "id": "text-only-with-stale-attachment",
+                "attachment": True,
+                "tool_call": True,
+                "modalities": {"input": ["text"]},
+                "limit": {"context": 128000, "output": 8192},
+            },
         },
     },
     "anthropic": {
@@ -243,7 +250,7 @@ class TestGetModelCapabilities:
     """Tests for get_model_capabilities vision detection."""
 
     def test_vision_from_attachment_flag(self):
-        """Models with attachment=True should report supports_vision=True."""
+        """Models with attachment=True and no modalities should report supports_vision=True."""
         with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
             caps = get_model_capabilities("anthropic", "claude-sonnet-4")
         assert caps is not None
@@ -257,6 +264,13 @@ class TestGetModelCapabilities:
         assert caps is not None
         assert caps.supports_vision is True
 
+    def test_text_only_modalities_override_stale_attachment_flag(self):
+        """Text-only modalities must win over stale attachment=True metadata."""
+        with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
+            caps = get_model_capabilities("google", "text-only-with-stale-attachment")
+        assert caps is not None
+        assert caps.supports_vision is False
+
     def test_no_vision_without_attachment_or_modalities(self):
         """Models with neither attachment nor image modality should be non-vision."""
         with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):