From 32aea113f090e6718c9d13eea14f0eb52dbd52b6 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 20 May 2026 22:59:14 -0700 Subject: [PATCH] fix(agent): consult supports_vision override in auto-mode routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The contributor PR (#17936) only patched the strip path in `_model_supports_vision()`. The auto-mode router in `agent/image_routing._lookup_supports_vision` still only read models.dev, so a custom-provider model declared as vision-capable would still get its images routed through vision_analyze in the default `agent.image_input_mode: auto` setting. Users had to set both `supports_vision: true` AND `image_input_mode: native` to bypass the text pipeline. Single-knob behavior now: `supports_vision: true` alone is enough in auto mode. The strip path and the routing path consult the same resolver. - Extract override resolution into `_supports_vision_override()` in agent/image_routing.py and wire it into `_lookup_supports_vision()`. - Refactor `run_agent._model_supports_vision` to call the same helper (DRY, single source of truth for the resolution order). - Strict YAML boolean coercion: `supports_vision: "false"` (quoted — a common YAML mistake) no longer coerces to True via bool() truthiness. Recognised tokens: true/false/yes/no/on/off/1/0 plus real bools and 0/1. Unrecognised values return None and fall through to models.dev. - Add @CNSeniorious000 to AUTHOR_MAP for release attribution. Tests: 26 new (TestCoerceCapabilityBool, TestSupportsVisionOverride, TestLookupSupportsVisionOverride, TestAutoModeRespectsOverride). Existing contributor tests + image_routing + vision_native_fast_path + native_image_buffer_isolation all green (92/92). --- agent/image_routing.py | 96 ++++++++++++++++- run_agent.py | 24 +---- scripts/release.py | 1 + tests/agent/test_image_routing.py | 165 ++++++++++++++++++++++++++++++ 4 files changed, 263 insertions(+), 23 deletions(-) diff --git a/agent/image_routing.py b/agent/image_routing.py index d5247ab222f..37e1cbbf102 100644 --- a/agent/image_routing.py +++ b/agent/image_routing.py @@ -46,6 +46,84 @@ logger = logging.getLogger(__name__) _VALID_MODES = frozenset({"auto", "native", "text"}) +# Strict YAML/JSON boolean coercion for capability overrides. +# +# ``bool("false")`` is True in Python because non-empty strings are truthy, so +# a user writing ``supports_vision: "false"`` (quoted — a common YAML mistake) +# would silently enable native vision routing on a model that can't actually +# handle it. Accept only the values YAML 1.1 / 1.2 treat as booleans, plus +# real ``bool`` and integer 0/1. Anything else returns None so the caller +# falls through to models.dev rather than honouring garbage. +_TRUE_TOKENS = frozenset({"true", "yes", "on", "1"}) +_FALSE_TOKENS = frozenset({"false", "no", "off", "0"}) + + +def _coerce_capability_bool(raw: Any) -> Optional[bool]: + """Return True/False for recognised boolean values, None otherwise.""" + if isinstance(raw, bool): + return raw + if isinstance(raw, int): + if raw in (0, 1): + return bool(raw) + return None + if isinstance(raw, str): + s = raw.strip().lower() + if s in _TRUE_TOKENS: + return True + if s in _FALSE_TOKENS: + return False + return None + + +def _supports_vision_override( + cfg: Optional[Dict[str, Any]], + provider: str, + model: str, +) -> Optional[bool]: + """Resolve user-declared vision capability from config.yaml. + + Resolution order, first hit wins: + 1. ``model.supports_vision`` (top-level shortcut for the active model) + 2. ``providers..models..supports_vision`` + (named custom providers — ``provider`` may be the runtime-resolved + value ``"custom"`` and/or the user-declared name under + ``model.provider``; both are tried) + + Returns None when no override is set, so the caller falls through to + models.dev. Returns False explicitly only when the user wrote a + recognised boolean false token. + """ + if not isinstance(cfg, dict): + return None + + # 1. Top-level shortcut + model_cfg_raw = cfg.get("model") + model_cfg: Dict[str, Any] = model_cfg_raw if isinstance(model_cfg_raw, dict) else {} + top = _coerce_capability_bool(model_cfg.get("supports_vision")) + if top is not None: + return top + + # 2. Per-provider, per-model. Named custom providers (e.g. "my-vllm") + # get rewritten to provider="custom" at runtime + # (hermes_cli/runtime_provider.py:_resolve_named_custom_runtime), so the + # config still holds the user-declared name under model.provider. Try + # both as candidate provider keys. + config_provider = str(model_cfg.get("provider") or "").strip() + providers_raw = cfg.get("providers") + providers_cfg: Dict[str, Any] = providers_raw if isinstance(providers_raw, dict) else {} + for p in dict.fromkeys(filter(None, (provider, config_provider))): + entry_raw = providers_cfg.get(p) + entry: Dict[str, Any] = entry_raw if isinstance(entry_raw, dict) else {} + models_raw = entry.get("models") + models_cfg: Dict[str, Any] = models_raw if isinstance(models_raw, dict) else {} + per_model_raw = models_cfg.get(model) + per_model: Dict[str, Any] = per_model_raw if isinstance(per_model_raw, dict) else {} + coerced = _coerce_capability_bool(per_model.get("supports_vision")) + if coerced is not None: + return coerced + return None + + def _coerce_mode(raw: Any) -> str: """Normalize a config value into one of the valid modes.""" if not isinstance(raw, str): @@ -81,8 +159,20 @@ def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool: return True -def _lookup_supports_vision(provider: str, model: str) -> Optional[bool]: - """Return True/False if we can resolve caps, None if unknown.""" +def _lookup_supports_vision( + provider: str, + model: str, + cfg: Optional[Dict[str, Any]] = None, +) -> Optional[bool]: + """Return True/False if we can resolve caps, None if unknown. + + Consults the user's ``supports_vision`` override in config.yaml first + (so custom/local models declared as vision-capable don't fall through to + text routing in ``auto`` mode), then falls back to models.dev. + """ + override = _supports_vision_override(cfg, provider, model) + if override is not None: + return override if not provider or not model: return None try: @@ -123,7 +213,7 @@ def decide_image_input_mode( if _explicit_aux_vision_override(cfg): return "text" - supports = _lookup_supports_vision(provider, model) + supports = _lookup_supports_vision(provider, model, cfg) if supports is True: return "native" return "text" diff --git a/run_agent.py b/run_agent.py index 34d591e59a8..001d03784ad 100644 --- a/run_agent.py +++ b/run_agent.py @@ -3201,7 +3201,7 @@ class AIAgent: messages (for non-vision models) or let the provider adapter handle them natively (for vision-capable models). - Resolution order: + Resolution order (see ``agent.image_routing._supports_vision_override``): 1. ``model.supports_vision`` (top-level, single-model shortcut) 2. ``providers..models..supports_vision`` 3. models.dev capability lookup @@ -3209,28 +3209,12 @@ class AIAgent: misclassified as non-vision and have their images stripped. """ try: - from hermes_cli.config import cfg_get, load_config + from hermes_cli.config import load_config + from agent.image_routing import _lookup_supports_vision cfg = load_config() provider = (getattr(self, "provider", "") or "").strip() model = (getattr(self, "model", "") or "").strip() - # self.provider is the runtime-resolved value, which is rewritten - # to "custom" for named custom providers (see - # hermes_cli/runtime_provider.py:_resolve_named_custom_runtime), - # while the config still holds the user-declared name (e.g. - # "my-vllm") under model.provider. Try both as provider keys. - config_provider = str(cfg_get(cfg, "model", "provider") or "").strip() - candidates = [("model", "supports_vision")] - for p in dict.fromkeys(filter(None, (provider, config_provider))): - candidates.append(("providers", p, "models", model, "supports_vision")) - for keys in candidates: - override = cfg_get(cfg, *keys) - if override is not None: - return bool(override) - from agent.models_dev import get_model_capabilities - if not provider or not model: - return False - caps = get_model_capabilities(provider, model) - return bool(caps and caps.supports_vision) + return _lookup_supports_vision(provider, model, cfg) is True except Exception: return False diff --git a/scripts/release.py b/scripts/release.py index eaae1c576e8..ce427414cd7 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -47,6 +47,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json" AUTHOR_MAP = { # teknium (multiple emails) "teknium1@gmail.com": "teknium1", + "me@promplate.dev": "CNSeniorious000", "erhanyasarx@gmail.com": "erhnysr", "30366221+WorldWriter@users.noreply.github.com": "WorldWriter", "dafeng@DafengdeMacBook-Pro.local": "WorldWriter", diff --git a/tests/agent/test_image_routing.py b/tests/agent/test_image_routing.py index 75f842b4711..ddb11cba409 100644 --- a/tests/agent/test_image_routing.py +++ b/tests/agent/test_image_routing.py @@ -9,8 +9,11 @@ from unittest.mock import patch import pytest from agent.image_routing import ( + _coerce_capability_bool, _coerce_mode, _explicit_aux_vision_override, + _lookup_supports_vision, + _supports_vision_override, build_native_content_parts, decide_image_input_mode, ) @@ -125,6 +128,168 @@ class TestDecideImageInputMode: assert decide_image_input_mode("xiaomi", "mimo-v2.5-pro", {}) == "text" +# ─── _coerce_capability_bool ───────────────────────────────────────────────── + + +class TestCoerceCapabilityBool: + def test_real_bool_passes_through(self): + assert _coerce_capability_bool(True) is True + assert _coerce_capability_bool(False) is False + + def test_int_0_and_1(self): + assert _coerce_capability_bool(1) is True + assert _coerce_capability_bool(0) is False + + def test_other_ints_return_none(self): + assert _coerce_capability_bool(2) is None + assert _coerce_capability_bool(-1) is None + + def test_yaml_true_tokens(self): + for s in ("true", "TRUE", "True", "yes", "on", "1", " true "): + assert _coerce_capability_bool(s) is True + + def test_yaml_false_tokens(self): + for s in ("false", "FALSE", "False", "no", "off", "0", " false "): + assert _coerce_capability_bool(s) is False + + def test_quoted_false_does_not_silently_become_true(self): + # Regression: bool("false") is True in Python. A user writing + # supports_vision: "false" must NOT enable native vision routing. + assert _coerce_capability_bool("false") is False + + def test_unrecognised_strings_return_none(self): + # None == fall through to models.dev, not a silent truthy. + assert _coerce_capability_bool("maybe") is None + assert _coerce_capability_bool("") is None + assert _coerce_capability_bool("definitely") is None + + def test_other_types_return_none(self): + assert _coerce_capability_bool(None) is None + assert _coerce_capability_bool([]) is None + assert _coerce_capability_bool({}) is None + assert _coerce_capability_bool(1.5) is None + + +# ─── _supports_vision_override ─────────────────────────────────────────────── + + +class TestSupportsVisionOverride: + def test_no_cfg_returns_none(self): + assert _supports_vision_override(None, "custom", "my-llava") is None + assert _supports_vision_override({}, "custom", "my-llava") is None + + def test_top_level_shortcut_wins(self): + cfg = {"model": {"supports_vision": True}} + assert _supports_vision_override(cfg, "custom", "my-llava") is True + + def test_top_level_false_propagates(self): + cfg = {"model": {"supports_vision": False}} + assert _supports_vision_override(cfg, "custom", "my-llava") is False + + def test_per_provider_per_model_via_runtime_name(self): + cfg = { + "providers": { + "custom": {"models": {"my-llava": {"supports_vision": True}}}, + }, + } + assert _supports_vision_override(cfg, "custom", "my-llava") is True + + def test_per_provider_per_model_via_config_name(self): + # Named custom provider — runtime self.provider == "custom", config + # holds the original name under model.provider. + cfg = { + "model": {"provider": "my-vllm"}, + "providers": { + "my-vllm": {"models": {"my-llava": {"supports_vision": True}}}, + }, + } + assert _supports_vision_override(cfg, "custom", "my-llava") is True + + def test_quoted_false_string_in_yaml_does_not_enable(self): + # Real-world: user writes supports_vision: "false" (quoted). + cfg = {"model": {"supports_vision": "false"}} + assert _supports_vision_override(cfg, "custom", "my-llava") is False + + def test_unrecognised_value_falls_through(self): + cfg = {"model": {"supports_vision": "maybe"}} + assert _supports_vision_override(cfg, "custom", "my-llava") is None + + def test_no_override_returns_none(self): + cfg = {"model": {"default": "my-llava"}} + assert _supports_vision_override(cfg, "custom", "my-llava") is None + + def test_malformed_sections_are_ignored(self): + # User accidentally wrote a string where a section was expected — + # don't blow up, just fall through. + cfg = {"model": "some-string", "providers": ["not-a-dict"]} + assert _supports_vision_override(cfg, "custom", "my-llava") is None + + +# ─── _lookup_supports_vision (override-aware) ──────────────────────────────── + + +class TestLookupSupportsVisionOverride: + def test_config_override_short_circuits_models_dev(self): + # Config says True, models.dev says None — config wins. + cfg = {"model": {"supports_vision": True}} + with patch("agent.models_dev.get_model_capabilities", return_value=None): + assert _lookup_supports_vision("custom", "my-llava", cfg) is True + + def test_config_override_false_beats_vision_capable_models_dev(self): + # User explicitly disables vision on a models.dev-vision-capable model. + fake_caps = type("Caps", (), {"supports_vision": True})() + cfg = {"model": {"supports_vision": False}} + with patch("agent.models_dev.get_model_capabilities", return_value=fake_caps): + assert _lookup_supports_vision("anthropic", "claude-sonnet-4", cfg) is False + + def test_no_override_falls_back_to_models_dev(self): + fake_caps = type("Caps", (), {"supports_vision": True})() + with patch("agent.models_dev.get_model_capabilities", return_value=fake_caps): + assert _lookup_supports_vision("anthropic", "claude-sonnet-4", {}) is True + + def test_no_override_no_models_dev_entry_returns_none(self): + with patch("agent.models_dev.get_model_capabilities", return_value=None): + assert _lookup_supports_vision("custom", "my-llava", {}) is None + + def test_cfg_none_falls_back_to_models_dev(self): + # Caller didn't pass cfg at all — old call sites must still work. + with patch("agent.models_dev.get_model_capabilities", return_value=None): + assert _lookup_supports_vision("openrouter", "x", None) is None + + +# ─── decide_image_input_mode with auto + override ──────────────────────────── + + +class TestAutoModeRespectsOverride: + def test_auto_native_for_custom_with_supports_vision_true(self): + # The motivating bug: Qwen3.6 on local llama.cpp via provider=custom. + # Without the override, auto falls back to text. With it, auto picks + # native — no need to also set agent.image_input_mode: native. + cfg = {"model": {"supports_vision": True}} + with patch("agent.models_dev.get_model_capabilities", return_value=None): + assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "native" + + def test_auto_text_for_custom_with_supports_vision_false(self): + cfg = {"model": {"supports_vision": False}} + with patch("agent.models_dev.get_model_capabilities", return_value=None): + assert decide_image_input_mode("custom", "some-text-only", cfg) == "text" + + def test_auto_text_for_custom_with_no_override(self): + # Unchanged baseline: unknown custom model → text. + with patch("agent.models_dev.get_model_capabilities", return_value=None): + assert decide_image_input_mode("custom", "unknown", {}) == "text" + + def test_explicit_aux_vision_override_still_wins(self): + # If the user has configured a dedicated vision aux backend, respect + # it even when supports_vision: true is also set. + cfg = { + "model": {"supports_vision": True}, + "auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}}, + } + with patch("agent.models_dev.get_model_capabilities", return_value=None): + assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "text" + + # ─── build_native_content_parts ──────────────────────────────────────────────