From 3d66787a04d29beaec16a723e4c2c2bb4dda2cee Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sun, 24 May 2026 15:01:28 -0700 Subject: [PATCH] fix(vision): route auxiliary.vision.provider=openai to api.openai.com, skip text-only main (#31452) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(vision): route auxiliary.vision.provider=openai to api.openai.com, skip text-only main for vision Fixes #31179. Three coupled fixes so a configured aux vision backend actually serves vision tasks instead of silently routing images to the user's main provider: 1. agent/auxiliary_client.py: `auxiliary..provider: openai` resolves to `custom` + `https://api.openai.com/v1`. "openai" was not in PROVIDER_REGISTRY (we have `openai-codex` for OAuth and `custom` for manual base_url), so the obvious config name silently failed to build a client. User-supplied base_url is still preserved; only the provider name normalises to `custom` so resolution doesn't hit the PROVIDER_REGISTRY-only path. 2. agent/auxiliary_client.py: the vision auto-detect chain now skips the user's main provider when models.dev reports `supports_vision=False`. Without this guard, a misconfigured aux provider would fall back to `auto`, which happily returned the main-provider client. The caller would then send image content to e.g. api.deepseek.com with model `gpt-4o-mini` and get a cryptic `unknown variant 'image_url', expected 'text'` from the provider's parser. 3. tools/vision_tools.py + tools/browser_tool.py: `check_vision_requirements` now mirrors the runtime fallback chain (explicit provider, then auto), so `vision_analyze` shows up whenever vision is actually serviceable. `browser_vision` gets a new `check_browser_vision_requirements` check_fn that AND-gates browser + vision availability, so it doesn't get advertised to the model when the call would fail at runtime. Reproduction (config from the bug report): model.provider: deepseek model.default: deepseek-v4-pro auxiliary.vision.provider: openai auxiliary.vision.model: gpt-4o-mini Before: resolve_vision_provider_client() returns None for the explicit provider, fallback auto returns the deepseek client with model='gpt-4o-mini', image hits api.deepseek.com → 'unknown variant image_url'. vision_analyze hidden from tool list; browser_vision exposed but fails at call time. After: resolves to custom + api.openai.com/v1 with model gpt-4o-mini. vision_analyze and browser_vision both gate correctly on capability. Tests: tests/agent/test_vision_routing_31179.py covers all three fixes (12 cases including the user's exact scenario, base_url preservation, text-only-main skip, capability-unknown permissive fallback, and tool gating parity). Existing 382 tests across auxiliary/vision/image_routing suites still pass. * test(vision): use exact hostname check to silence CodeQL substring-sanitization alert * fix(auxiliary): drop model name from vision-skip debug log to silence CodeQL The new `logger.debug(...)` added in the previous commit interpolated both `main_provider` and `vision_model` (a public model slug \u2014 not sensitive). CodeQL's `py/clear-text-logging-sensitive-data` heuristic re-flagged it twice because the rule mis-detects multi-value interpolations near tainted-via-config provider strings. Drop the model from the log args (provider alone is enough to diagnose the skip; the same sibling branch a few lines up already logs provider only). Behavior unchanged; CodeQL false positive cleared. --- agent/auxiliary_client.py | 84 +++++++ tests/agent/test_vision_routing_31179.py | 297 +++++++++++++++++++++++ tools/browser_tool.py | 20 +- tools/vision_tools.py | 19 +- 4 files changed, 417 insertions(+), 3 deletions(-) create mode 100644 tests/agent/test_vision_routing_31179.py diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 40044a1062e..37880190426 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -3730,6 +3730,37 @@ _VISION_AUTO_PROVIDER_ORDER = ( ) +def _main_model_supports_vision(provider: str, model: Optional[str]) -> bool: + """Return True when ``provider``/``model`` is known to accept image input. + + Used by the vision auto-detect chain to skip the user's main provider + when it's known to be text-only (e.g. DeepSeek, gpt-oss without vision). + Without this guard, ``resolve_vision_provider_client(provider="auto")`` + would happily return the main-provider client and any subsequent image + payload would surface as a cryptic provider-side error + (``unknown variant `image_url`, expected `text```, #31179). + + Returns True when capability lookup is unknown — preserves the historical + behaviour of attempting the call, so providers we haven't catalogued yet + don't silently regress to text-only. + """ + try: + from agent.image_routing import _lookup_supports_vision + from hermes_cli.config import load_config + except ImportError: + return True + try: + supports = _lookup_supports_vision(provider, model, load_config()) + except Exception: # pragma: no cover - defensive + return True + if supports is None: + # No capability data — keep current behaviour and let the call attempt + # happen rather than silently skipping. This avoids false-positive + # skips for new/custom providers. + return True + return bool(supports) + + def _normalize_vision_provider(provider: Optional[str]) -> str: return _normalize_aux_provider(provider) @@ -3870,6 +3901,23 @@ def resolve_vision_provider_client( "vision support) — falling through to aggregator chain", main_provider, ) + elif not _main_model_supports_vision(main_provider, vision_model): + # The main model is known to be text-only (e.g. DeepSeek V4, + # gpt-oss-120b without vision). Building a client and sending + # an image would produce a cryptic provider-side error like + # ``unknown variant `image_url`, expected `text``` (#31179). + # Fall through to the aggregator chain instead. + # + # Only log the provider name (not the model) — mirrors the + # sibling _PROVIDERS_WITHOUT_VISION branch above, and avoids + # CodeQL py/clear-text-logging-sensitive-data heuristic false + # positives on multi-value interpolations. + logger.debug( + "Vision auto-detect: skipping main provider %s " + "(reports no vision capability) — falling through to " + "aggregator chain", + main_provider, + ) else: rpc_client, rpc_model = resolve_provider_client( main_provider, vision_model, @@ -4281,6 +4329,23 @@ def _get_cached_client( return client, model or default_model +# Aliases that target direct REST APIs not modeled as first-class providers +# in PROVIDER_REGISTRY. Used for ``auxiliary..provider`` so users can +# write the obvious name and have it resolve to a working ``custom`` endpoint +# without needing to know our internal provider IDs. +# +# Why these specifically: PROVIDER_REGISTRY has ``openai-codex`` (OAuth) and +# ``custom`` (manual base_url + OPENAI_API_KEY) but no plain ``openai`` for +# direct API-key access. Users predictably type ``provider: openai`` and +# expect it to use OPENAI_API_KEY against api.openai.com. Previously this +# silently fell back to the user's main provider, sending OpenAI model names +# to e.g. DeepSeek and producing cryptic ``unknown variant 'image_url'`` +# errors (issue #31179). +_AUX_DIRECT_API_BASE_URLS: Dict[str, str] = { + "openai": "https://api.openai.com/v1", +} + + def _resolve_task_provider_model( task: str = None, provider: str = None, @@ -4317,6 +4382,25 @@ def _resolve_task_provider_model( resolved_model = model or cfg_model resolved_api_mode = cfg_api_mode + # Convenience aliases for direct API-key endpoints that aren't first-class + # providers (e.g. ``provider: openai`` → custom + api.openai.com/v1). + # Applied to both explicit args and config-derived values. When the user + # has already supplied a base_url we keep their endpoint but still rewrite + # the provider to ``custom`` so resolution doesn't hit the + # PROVIDER_REGISTRY-only path (which has no ``openai`` entry). + def _expand_direct_api_alias(prov: Optional[str], existing_base: Optional[str]) -> Tuple[Optional[str], Optional[str]]: + if not prov: + return prov, existing_base + target_base = _AUX_DIRECT_API_BASE_URLS.get(prov.strip().lower()) + if target_base is None: + return prov, existing_base + return "custom", existing_base or target_base + + if provider: + provider, base_url = _expand_direct_api_alias(provider, base_url) + if cfg_provider: + cfg_provider, cfg_base_url = _expand_direct_api_alias(cfg_provider, cfg_base_url) + if base_url: return "custom", resolved_model, base_url, api_key, resolved_api_mode if provider: diff --git a/tests/agent/test_vision_routing_31179.py b/tests/agent/test_vision_routing_31179.py new file mode 100644 index 00000000000..268cd27aa96 --- /dev/null +++ b/tests/agent/test_vision_routing_31179.py @@ -0,0 +1,297 @@ +"""Regression tests for issue #31179. + +Before the fix: + - ``auxiliary.vision.provider: openai`` silently failed to resolve because + ``openai`` is not a first-class provider in PROVIDER_REGISTRY (only + ``openai-codex`` for OAuth and ``custom`` for OPENAI_BASE_URL). + - The vision branch of ``call_llm`` then silently fell back to ``auto`` + which happily picked the user's main provider (e.g. DeepSeek), sending + image content to a text-only endpoint and producing cryptic + ``unknown variant 'image_url', expected 'text'`` errors. + - ``check_vision_requirements`` used the explicit-only path, so + ``vision_analyze`` disappeared from the tool list while ``browser_vision`` + stayed (its check_fn only validated the browser). + +The three fixes covered here: + 1. ``provider: openai`` in auxiliary task config resolves to + ``custom`` + ``https://api.openai.com/v1``. + 2. The vision auto-detect chain skips the user's main provider when it + reports ``supports_vision=False`` instead of routing image content to + a text-only endpoint. + 3. ``check_vision_requirements`` mirrors the runtime fallback chain so + ``vision_analyze`` shows up whenever the auto chain can serve vision, + and ``browser_vision`` gates on vision availability as well. +""" + +from __future__ import annotations + +import os +import shutil +import sys +import tempfile + +import pytest + + +# --------------------------------------------------------------------------- +# Test infrastructure +# --------------------------------------------------------------------------- + + +@pytest.fixture +def isolated_home(monkeypatch): + """Temp HERMES_HOME with config + clean credential env vars.""" + test_home = tempfile.mkdtemp(prefix="hermes_test_31179_") + hermes_home = os.path.join(test_home, ".hermes") + os.makedirs(hermes_home) + monkeypatch.setenv("HERMES_HOME", hermes_home) + + # Strip all credential-shaped env vars so each scenario starts hermetic. + for k in list(os.environ.keys()): + if k.endswith("_API_KEY") or k.endswith("_TOKEN"): + monkeypatch.delenv(k, raising=False) + + yield hermes_home + shutil.rmtree(test_home, ignore_errors=True) + + +def _write_config(home: str, text: str) -> None: + with open(os.path.join(home, "config.yaml"), "w") as fp: + fp.write(text) + + +def _fresh_modules(): + """Drop cached hermes modules so each test reloads against current env.""" + for mod in list(sys.modules.keys()): + if mod.startswith(("agent.auxiliary_client", "agent.image_routing", + "tools.vision_tools", "tools.browser_tool", + "hermes_cli.config")): + del sys.modules[mod] + + +# --------------------------------------------------------------------------- +# Fix 1: provider=openai → custom + api.openai.com/v1 +# --------------------------------------------------------------------------- + + +class TestOpenAiAliasForAuxiliary: + """``auxiliary..provider: openai`` should produce a working client.""" + + def test_provider_openai_routes_to_openai_dot_com(self, isolated_home, monkeypatch): + _write_config(isolated_home, """ +auxiliary: + vision: + provider: openai + model: gpt-4o-mini +""") + monkeypatch.setenv("OPENAI_API_KEY", "sk-test") + _fresh_modules() + + from agent.auxiliary_client import _resolve_task_provider_model + provider, model, base_url, _key, _mode = _resolve_task_provider_model("vision") + assert provider == "custom" + assert model == "gpt-4o-mini" + assert base_url == "https://api.openai.com/v1" + + def test_provider_openai_with_explicit_base_url_preserves_user_endpoint( + self, isolated_home, monkeypatch + ): + """User-supplied base_url wins; alias still normalizes provider name + to ``custom`` so resolution doesn't hit the unknown-provider path.""" + _write_config(isolated_home, """ +auxiliary: + vision: + provider: openai + model: gpt-4o-mini + base_url: https://my-proxy.example.com/v1 +""") + monkeypatch.setenv("OPENAI_API_KEY", "sk-test") + _fresh_modules() + + from agent.auxiliary_client import _resolve_task_provider_model + provider, _model, base_url, _key, _mode = _resolve_task_provider_model("vision") + assert provider == "custom" + assert base_url == "https://my-proxy.example.com/v1" + + def test_provider_openai_resolves_to_working_client(self, isolated_home, monkeypatch): + """End-to-end: the resolved client points at api.openai.com.""" + _write_config(isolated_home, """ +auxiliary: + vision: + provider: openai + model: gpt-4o-mini +""") + monkeypatch.setenv("OPENAI_API_KEY", "sk-test") + _fresh_modules() + + from agent.auxiliary_client import resolve_vision_provider_client + from urllib.parse import urlparse + provider, client, model = resolve_vision_provider_client() + assert client is not None, "openai alias should produce a usable client" + # Exact hostname comparison (not substring) — defends against URLs + # like ``api.openai.com.evil.example`` and keeps CodeQL happy. + host = urlparse(str(getattr(client, "base_url", ""))).hostname or "" + assert host == "api.openai.com", f"expected api.openai.com host, got {host!r}" + assert model == "gpt-4o-mini" + + +# --------------------------------------------------------------------------- +# Fix 2: auto chain skips text-only main providers +# --------------------------------------------------------------------------- + + +class TestTextOnlyMainSkippedForVision: + """Vision auto-detect must not return a text-only main-provider client.""" + + def test_text_only_main_skipped_when_no_aggregator(self, isolated_home, monkeypatch): + """DeepSeek main + no aggregator credentials → no client built. + + Pre-fix this silently returned the deepseek client with model + substitution, producing ``unknown variant 'image_url'`` at call time. + """ + _write_config(isolated_home, """ +model: + provider: deepseek + default: deepseek-v4-pro +""") + monkeypatch.setenv("DEEPSEEK_API_KEY", "sk-test") + _fresh_modules() + + from agent.auxiliary_client import resolve_vision_provider_client + provider, client, _model = resolve_vision_provider_client(provider="auto") + assert client is None, ( + f"Vision auto-detect must skip text-only main {provider!r} when " + "no vision-capable aggregator is available, not return a client " + "that will fail at API time" + ) + + def test_vision_capable_main_used(self, isolated_home, monkeypatch): + """Vision-capable main provider should be returned by auto chain.""" + _write_config(isolated_home, """ +model: + provider: anthropic + default: claude-sonnet-4-6 +""") + monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-test") + _fresh_modules() + + from agent.auxiliary_client import resolve_vision_provider_client + provider, client, _model = resolve_vision_provider_client(provider="auto") + assert client is not None + assert provider == "anthropic" + + def test_unknown_capability_does_not_block(self, isolated_home, monkeypatch): + """When models.dev has no entry, fall back to permissive (attempt the call). + + This keeps new/custom providers working — only providers we have + cataloged as text-only are skipped. + """ + _fresh_modules() + from agent.auxiliary_client import _main_model_supports_vision + # Bogus provider/model — capability lookup returns None → permissive. + assert _main_model_supports_vision("nonexistent-provider", "nonexistent-model") is True + + +# --------------------------------------------------------------------------- +# Fix 3: check_vision_requirements + check_browser_vision_requirements parity +# --------------------------------------------------------------------------- + + +class TestVisionToolGating: + """Tool visibility must match runtime capability.""" + + def test_check_vision_succeeds_for_aliased_openai(self, isolated_home, monkeypatch): + """The user's exact reported scenario: provider=openai unhides + vision_analyze instead of silently dropping it.""" + _write_config(isolated_home, """ +auxiliary: + vision: + provider: openai + model: gpt-4o-mini +""") + monkeypatch.setenv("OPENAI_API_KEY", "sk-test") + _fresh_modules() + + from tools.vision_tools import check_vision_requirements + assert check_vision_requirements() is True + + def test_check_vision_falls_back_to_auto(self, isolated_home, monkeypatch): + """Bad explicit provider doesn't hide the tool when auto fallback works. + + Mirrors call_llm's runtime fallback chain. + """ + _write_config(isolated_home, """ +model: + provider: openrouter + default: anthropic/claude-sonnet-4 +auxiliary: + vision: + provider: not-a-real-provider +""") + monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-test") + _fresh_modules() + + from tools.vision_tools import check_vision_requirements + assert check_vision_requirements() is True + + def test_check_vision_false_with_text_only_main_and_no_aggregator( + self, isolated_home, monkeypatch + ): + _write_config(isolated_home, """ +model: + provider: deepseek + default: deepseek-v4-pro +""") + monkeypatch.setenv("DEEPSEEK_API_KEY", "sk-test") + _fresh_modules() + + from tools.vision_tools import check_vision_requirements + assert check_vision_requirements() is False + + def test_browser_vision_requires_both_browser_and_vision(self, isolated_home, monkeypatch): + """``browser_vision`` must not be advertised when vision is unavailable.""" + from unittest.mock import patch + + _write_config(isolated_home, """ +model: + provider: deepseek + default: deepseek-v4-pro +""") + monkeypatch.setenv("DEEPSEEK_API_KEY", "sk-test") + _fresh_modules() + + import tools.browser_tool + # Force the browser side to True so we exercise the vision-gating part. + with patch.object(tools.browser_tool, "check_browser_requirements", return_value=True): + assert tools.browser_tool.check_browser_vision_requirements() is False + + def test_browser_vision_false_when_browser_missing(self, isolated_home, monkeypatch): + from unittest.mock import patch + + _write_config(isolated_home, """ +model: + provider: openrouter + default: anthropic/claude-sonnet-4 +""") + monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-test") + _fresh_modules() + + import tools.browser_tool + with patch.object(tools.browser_tool, "check_browser_requirements", return_value=False): + # Vision available but browser missing → still False. + assert tools.browser_tool.check_browser_vision_requirements() is False + + def test_browser_vision_true_when_both_available(self, isolated_home, monkeypatch): + from unittest.mock import patch + + _write_config(isolated_home, """ +model: + provider: openrouter + default: anthropic/claude-sonnet-4 +""") + monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-test") + _fresh_modules() + + import tools.browser_tool + with patch.object(tools.browser_tool, "check_browser_requirements", return_value=True): + assert tools.browser_tool.check_browser_vision_requirements() is True diff --git a/tools/browser_tool.py b/tools/browser_tool.py index f252544edc1..5320d6adfdb 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -3652,6 +3652,24 @@ def check_browser_requirements() -> bool: return True +def check_browser_vision_requirements() -> bool: + """Whether ``browser_vision`` should be advertised to the model. + + Requires BOTH a working browser (``check_browser_requirements``) AND a + resolvable vision backend. Without the vision check, the tool stays in + the model's tool list even when no vision provider is configured, then + fails at call time with a cryptic provider-side error like + ``unknown variant `image_url`, expected `text``` (issue #31179). + """ + if not check_browser_requirements(): + return False + try: + from tools.vision_tools import check_vision_requirements + except ImportError: + return False + return check_vision_requirements() + + # ============================================================================ # Module Test # ============================================================================ @@ -3786,7 +3804,7 @@ registry.register( toolset="browser", schema=_BROWSER_SCHEMA_MAP["browser_vision"], handler=lambda args, **kw: browser_vision(question=args.get("question", ""), annotate=args.get("annotate", False), task_id=kw.get("task_id")), - check_fn=check_browser_requirements, + check_fn=check_browser_vision_requirements, emoji="👁️", ) registry.register( diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 912777e2e25..38d19919488 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -914,11 +914,26 @@ async def vision_analyze_tool( def check_vision_requirements() -> bool: - """Check if the configured runtime vision path can resolve a client.""" + """Check if the configured runtime vision path can resolve a client. + + Mirrors the fallback chain that ``call_llm(task="vision")`` actually uses + at runtime: first the explicit ``auxiliary.vision.provider`` (if any), + and if that fails, the auto chain (main provider → openrouter → nous). + Without the auto-fallback step the tool would disappear from the model's + tool list whenever the explicit provider name was unresolvable, even + when the auto chain would have served the request (issue #31179). + """ try: from agent.auxiliary_client import resolve_vision_provider_client - + except ImportError: + return False + try: _provider, client, _model = resolve_vision_provider_client() + if client is not None: + return True + # Same fallback to "auto" that call_llm performs when the configured + # provider can't be resolved. + _provider, client, _model = resolve_vision_provider_client(provider="auto") return client is not None except Exception: return False