fix(models): prefer image modalities for vision routing

This commit is contained in:
LeonSGP43 2026-05-03 19:32:26 +08:00 committed by Teknium
parent 6e46f99e7e
commit 14f38822fa
3 changed files with 39 additions and 6 deletions

View file

@ -381,14 +381,18 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit
# Extract capability flags (default to False if missing)
supports_tools = bool(entry.get("tool_call", False))
# Vision: check both the `attachment` flag and `modalities.input` for "image".
# Some models (e.g. gemma-4) list image in input modalities but not attachment.
# Vision: prefer explicit `modalities.input` when models.dev provides it.
# The older `attachment` flag can be stale or too broad for image routing;
# fall back to it only when the input modalities are absent/invalid.
input_mods = entry.get("modalities", {})
if isinstance(input_mods, dict):
input_mods = input_mods.get("input", [])
input_mods = input_mods.get("input")
else:
input_mods = []
supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
input_mods = None
if isinstance(input_mods, list):
supports_vision = "image" in input_mods
else:
supports_vision = bool(entry.get("attachment", False))
supports_reasoning = bool(entry.get("reasoning", False))
# Extract limits

View file

@ -109,6 +109,21 @@ class TestDecideImageInputMode:
with patch("agent.image_routing._lookup_supports_vision", return_value=True):
assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native"
def test_auto_uses_text_for_text_only_modalities_even_with_attachment_flag(self):
    """Auto mode must pick "text" when the model's input modalities are text-only.

    The fake registry entry still sets ``attachment`` to True; the explicit
    ``modalities.input`` list (text only) is expected to decide the routing.
    """
    # Registry entry whose attachment flag disagrees with its input modalities.
    model_entry = {
        "attachment": True,
        "modalities": {"input": ["text"]},
        "tool_call": True,
    }
    fake_registry = {"xiaomi": {"models": {"mimo-v2.5-pro": model_entry}}}

    with patch("agent.models_dev.fetch_models_dev", return_value=fake_registry):
        chosen_mode = decide_image_input_mode("xiaomi", "mimo-v2.5-pro", {})

    assert chosen_mode == "text"
# ─── build_native_content_parts ──────────────────────────────────────────────

View file

@ -223,6 +223,13 @@ CAPS_REGISTRY = {
"tool_call": True,
"limit": {"context": 32000, "output": 8192},
},
"text-only-with-stale-attachment": {
"id": "text-only-with-stale-attachment",
"attachment": True,
"tool_call": True,
"modalities": {"input": ["text"]},
"limit": {"context": 128000, "output": 8192},
},
},
},
"anthropic": {
@ -243,7 +250,7 @@ class TestGetModelCapabilities:
"""Tests for get_model_capabilities vision detection."""
def test_vision_from_attachment_flag(self):
"""Models with attachment=True should report supports_vision=True."""
"""Models with attachment=True and no modalities should report supports_vision=True."""
with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):
caps = get_model_capabilities("anthropic", "claude-sonnet-4")
assert caps is not None
@ -257,6 +264,13 @@ class TestGetModelCapabilities:
assert caps is not None
assert caps.supports_vision is True
def test_text_only_modalities_override_stale_attachment_flag(self):
    """Explicit text-only input modalities take priority over attachment=True.

    Looks up the ``text-only-with-stale-attachment`` fixture entry and checks
    that it is reported as a non-vision model.
    """
    registry_patch = patch(
        "agent.models_dev.fetch_models_dev",
        return_value=CAPS_REGISTRY,
    )
    with registry_patch:
        capabilities = get_model_capabilities(
            "google", "text-only-with-stale-attachment"
        )

    # The lookup must succeed before the vision flag can be inspected.
    assert capabilities is not None
    assert capabilities.supports_vision is False
def test_no_vision_without_attachment_or_modalities(self):
"""Models with neither attachment nor image modality should be non-vision."""
with patch("agent.models_dev.fetch_models_dev", return_value=CAPS_REGISTRY):