refactor(vision): consolidate native fast-path gate into one shared helper

The fast-path decision (native routing + provider allowlist OR supports_vision override) lived inline in vision_analyze and was copied into browser_vision. Extract it to _should_use_native_vision_fast_path() so both tools share one source of truth. - vision_tools: gate logic now one helper; vision_analyze calls it in 3 lines - browser_tool: thin envelope decoration over the shared helper, not a copy - browser_vision typed Union[str, Dict] to match its real return shape - tests slimmed to target the override path + text-mode-wins invariant
2026-07-27 17:58:07 +00:00 · 2026-05-29 03:40:26 -07:00 · 2026-05-29 03:40:26 -07:00 · c77a697fa4
commit c77a697fa4
parent c3f28c651d
4 changed files with 140 additions and 193 deletions
--- a/tests/tools/test_browser_console.py
+++ b/tests/tools/test_browser_console.py
@ -251,6 +251,7 @@ class TestBrowserVisionConfig:
        assert mock_llm.call_args.kwargs["timeout"] == 120.0

    def test_browser_vision_native_fast_path_returns_multimodal(self, tmp_path):
+        """supports_vision override → screenshot attached natively, no aux call."""
        from agent.auxiliary_client import clear_runtime_main, set_runtime_main
        from tools.browser_tool import browser_vision

@ -265,10 +266,7 @@ class TestBrowserVisionConfig:
                    "tools.browser_tool._run_browser_command",
                    return_value={
                        "success": True,
-                        "data": {
-                            "path": str(screenshot),
-                            "annotations": annotations,
-                        },
+                        "data": {"path": str(screenshot), "annotations": annotations},
                    },
                ),
                patch(
@ -278,9 +276,7 @@ class TestBrowserVisionConfig:
                patch("tools.browser_tool._get_vision_model") as mock_get_vision_model,
                patch("tools.browser_tool.call_llm") as mock_llm,
            ):
-                result = browser_vision(
-                    "what is on the page?", annotate=True, task_id="test"
-                )
+                result = browser_vision("what is on the page?", annotate=True, task_id="test")
        finally:
            clear_runtime_main()

@ -289,55 +285,12 @@ class TestBrowserVisionConfig:
        assert result["meta"]["screenshot_path"] == str(screenshot)
        assert result["meta"]["annotations"] == annotations
        assert any(p.get("type") == "image_url" for p in result["content"])
-        assert "what is on the page?" in result["content"][0]["text"]
-        assert str(screenshot) in result["content"][0]["text"]
-        assert "Screenshot path:" in result["text_summary"]
+        assert f"Screenshot path: {screenshot}" in result["text_summary"]
        mock_get_vision_model.assert_not_called()
        mock_llm.assert_not_called()

-    def test_browser_vision_native_mode_without_supports_vision_uses_aux_llm(self, tmp_path):
-        from agent.auxiliary_client import clear_runtime_main, set_runtime_main
-        from tools.browser_tool import browser_vision
-
-        shots_dir, screenshot = self._setup_screenshot(tmp_path)
-        mock_response = MagicMock()
-        mock_choice = MagicMock()
-        mock_choice.message.content = "Fallback screenshot analysis"
-        mock_response.choices = [mock_choice]
-
-        set_runtime_main("brand-new-provider", "opaque-model")
-        try:
-            with (
-                patch("hermes_constants.get_hermes_dir", return_value=shots_dir),
-                patch("tools.browser_tool._cleanup_old_screenshots"),
-                patch(
-                    "tools.browser_tool._run_browser_command",
-                    return_value={"success": True, "data": {"path": str(screenshot)}},
-                ),
-                patch(
-                    "hermes_cli.config.load_config",
-                    return_value={"agent": {"image_input_mode": "native"}},
-                ),
-                patch("tools.browser_tool._get_vision_model", return_value="test-model"),
-                patch("tools.browser_tool.call_llm", return_value=mock_response) as mock_llm,
-            ):
-                result = json.loads(browser_vision("what is on the page?", task_id="test"))
-        finally:
-            clear_runtime_main()
-
-        assert result["success"] is True
-        assert result["analysis"] == "Fallback screenshot analysis"
-        assert result["screenshot_path"] == str(screenshot)
-        mock_llm.assert_called_once()
-        kwargs = mock_llm.call_args.kwargs
-        assert kwargs["task"] == "vision"
-        assert kwargs["model"] == "test-model"
-        assert kwargs["messages"][0]["content"][1]["type"] == "image_url"
-        assert kwargs["messages"][0]["content"][1]["image_url"]["url"].startswith(
-            "data:image/png;base64,"
-        )
-
    def test_browser_vision_text_mode_blocks_native_fast_path(self, tmp_path):
+        """Explicit text routing → aux LLM used even with supports_vision."""
        from agent.auxiliary_client import clear_runtime_main, set_runtime_main
        from tools.browser_tool import browser_vision

@ -372,7 +325,6 @@ class TestBrowserVisionConfig:

        assert result["success"] is True
        assert result["analysis"] == "Text-mode screenshot analysis"
-        assert result["screenshot_path"] == str(screenshot)
        mock_llm.assert_called_once()


--- a/tests/tools/test_vision_native_fast_path.py
+++ b/tests/tools/test_vision_native_fast_path.py
@ -146,35 +146,32 @@ class TestVisionAnalyzeNative:
 class TestHandleVisionAnalyzeFastPath:
    """Verify the dispatcher chooses fast-path vs aux-LLM correctly."""

-    def test_native_mode_with_supported_transport_uses_fast_path(self, tmp_path):
-        """Explicit native mode + known transport returns multimodal."""
+    def test_vision_capable_main_model_uses_fast_path(self, tmp_path, monkeypatch):
+        """Main model supports native vision → fast path returns multimodal."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

-        async def _aux_sentinel(*args, **kwargs):
-            return '{"sentinel": "aux-path"}'
-
+        # Set runtime override so the handler thinks we're on opus@openrouter
        from agent.auxiliary_client import set_runtime_main, clear_runtime_main
        set_runtime_main("openrouter", "anthropic/claude-opus-4.6")
        try:
+            # Mock decide_image_input_mode to always return "native" so the
+            # fast path fires regardless of model-catalog state in CI.
            with patch(
-                "hermes_cli.config.load_config",
-                return_value={"agent": {"image_input_mode": "native"}},
-            ), patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux:
-                result = asyncio.get_event_loop().run_until_complete(
-                    _handle_vision_analyze({"image_url": str(img), "question": "?"})
-                )
+                "agent.image_routing.decide_image_input_mode",
+                return_value="native",
+            ):
+                coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
+                result = asyncio.get_event_loop().run_until_complete(coro)
        finally:
            clear_runtime_main()

-        assert isinstance(result, dict), (
+        assert isinstance(result, dict), \
            f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}"
-        )
        assert result.get("_multimodal") is True
-        mock_aux.assert_not_called()

-    def test_native_mode_with_unsupported_transport_falls_through(self, tmp_path):
-        """Explicit native mode still respects the transport gate."""
+    def test_non_vision_main_model_falls_through_to_aux(self, tmp_path, monkeypatch):
+        """Non-vision main model → fast path skipped, aux LLM path attempted."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

@ -182,27 +179,39 @@ class TestHandleVisionAnalyzeFastPath:
            return '{"sentinel": "aux-path"}'

        from agent.auxiliary_client import set_runtime_main, clear_runtime_main
-        set_runtime_main("brand-new-provider", "opaque-model")
+        set_runtime_main("openrouter", "qwen/qwen3-coder")
        try:
-            with (
-                patch(
-                    "hermes_cli.config.load_config",
-                    return_value={"agent": {"image_input_mode": "native"}},
-                ),
-                patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux,
-            ):
-                result = asyncio.get_event_loop().run_until_complete(
-                    _handle_vision_analyze({"image_url": str(img), "question": "?"})
-                )
+            with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel):
+                coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
+                result = asyncio.get_event_loop().run_until_complete(coro)
        finally:
            clear_runtime_main()

-        assert isinstance(result, str)
-        assert json.loads(result) == {"sentinel": "aux-path"}
-        mock_aux.assert_called_once()
+        assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
+            "Fast path fired for non-vision model; should have fallen through to aux LLM"

-    def test_supports_vision_bypasses_transport_gate(self, tmp_path):
-        """supports_vision=True enables fast path even on unknown providers."""
+    def test_fast_path_disabled_for_unsupported_provider(self, tmp_path, monkeypatch):
+        """Even with vision-capable model, unknown provider → fall through."""
+        img = tmp_path / "x.png"
+        img.write_bytes(_TINY_PNG)
+
+        async def _aux_sentinel(*args, **kwargs):
+            return '{"sentinel": "aux-path"}'
+
+        from agent.auxiliary_client import set_runtime_main, clear_runtime_main
+        set_runtime_main("brand-new-provider", "anthropic/claude-opus-4.6")
+        try:
+            with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel):
+                coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
+                result = asyncio.get_event_loop().run_until_complete(coro)
+        finally:
+            clear_runtime_main()
+
+        assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
+            "Fast path fired for unknown provider; should have fallen through"
+
+    def test_supports_vision_override_bypasses_provider_allowlist(self, tmp_path):
+        """supports_vision=true enables the fast path on an unlisted provider."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

@ -215,21 +224,19 @@ class TestHandleVisionAnalyzeFastPath:
            with patch(
                "hermes_cli.config.load_config",
                return_value={"model": {"supports_vision": True}},
-            ), patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux:
-                result = asyncio.get_event_loop().run_until_complete(
-                    _handle_vision_analyze({"image_url": str(img), "question": "?"})
-                )
+            ), patch(
+                "tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel,
+            ) as mock_aux:
+                coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
+                result = asyncio.get_event_loop().run_until_complete(coro)
        finally:
            clear_runtime_main()

-        assert isinstance(result, dict), (
-            f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}"
-        )
-        assert result.get("_multimodal") is True
+        assert isinstance(result, dict) and result.get("_multimodal") is True
        mock_aux.assert_not_called()

-    def test_text_mode_still_blocks_fast_path_when_supports_vision_true(self, tmp_path):
-        """Routing mode wins over supports_vision when text mode was chosen."""
+    def test_text_mode_wins_over_supports_vision_override(self, tmp_path):
+        """Explicit text routing blocks the fast path even with supports_vision."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

@ -239,19 +246,17 @@ class TestHandleVisionAnalyzeFastPath:
        from agent.auxiliary_client import set_runtime_main, clear_runtime_main
        set_runtime_main("brand-new-provider", "llava-v1.6")
        try:
-            with (
-                patch(
-                    "hermes_cli.config.load_config",
-                    return_value={
-                        "agent": {"image_input_mode": "text"},
-                        "model": {"supports_vision": True},
-                    },
-                ),
-                patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux,
-            ):
-                result = asyncio.get_event_loop().run_until_complete(
-                    _handle_vision_analyze({"image_url": str(img), "question": "?"})
-                )
+            with patch(
+                "hermes_cli.config.load_config",
+                return_value={
+                    "agent": {"image_input_mode": "text"},
+                    "model": {"supports_vision": True},
+                },
+            ), patch(
+                "tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel,
+            ) as mock_aux:
+                coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
+                result = asyncio.get_event_loop().run_until_complete(coro)
        finally:
            clear_runtime_main()

--- a/tools/browser_tool.py
+++ b/tools/browser_tool.py
@ -62,7 +62,7 @@ import tempfile
 import threading
 import time
 import requests
-from typing import Dict, Any, Optional, List, Tuple
+from typing import Dict, Any, Optional, List, Tuple, Union
 from pathlib import Path
 from agent.auxiliary_client import call_llm
 from hermes_constants import get_hermes_home
@ -3044,16 +3044,16 @@ def browser_get_images(task_id: Optional[str] = None) -> str:
        return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False)


-def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str:
+def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> Union[str, Dict[str, Any]]:
    """
    Take a screenshot of the current page for visual inspection.

-    This tool captures what's visually displayed in the browser. When the
-    active model supports native vision, the screenshot is attached directly
-    to the conversation so the model can inspect it on the next turn.
-    Otherwise Hermes falls back to the auxiliary vision model. Useful for
-    understanding visual content that the text-based snapshot may not capture
-    (CAPTCHAs, verification challenges, images, complex layouts, etc.).
+    Captures what's visually displayed in the browser. When the active model
+    supports native vision, the screenshot is attached directly to the
+    conversation so the model can inspect it on the next turn; otherwise Hermes
+    falls back to the auxiliary vision model and returns a text analysis. Useful
+    for visual content the text-based snapshot may not capture (CAPTCHAs,
+    verification challenges, images, complex layouts, etc.).

    The screenshot is saved persistently and its file path is returned so it
    can be shared with users via MEDIA:<path> in the response.
@ -3064,8 +3064,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
        task_id: Task identifier for session isolation

    Returns:
-        Either a JSON string with vision analysis results and screenshot_path,
-        or a multimodal tool-result envelope with the screenshot and metadata.
+        A JSON string with vision analysis results and screenshot_path, or a
+        multimodal tool-result envelope carrying the screenshot and metadata.
    """
    if _is_camofox_mode():
        from tools.browser_camofox import camofox_vision
@ -3190,55 +3190,33 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
        _screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
        data_url = f"data:image/png;base64,{_screenshot_b64}"

-        # Fast path: when the active main model supports native vision AND the
-        # provider supports image content inside tool results, short-circuit
-        # the auxiliary LLM and return the image bytes as a multimodal
-        # tool-result envelope. The user can force native vision with the 
-        # supports_vision override. The main model sees the pixels directly on its
-        # next turn — no aux call, no information loss, no extra latency.
-        try:
-            from agent.auxiliary_client import _read_main_model, _read_main_provider
-            from agent.image_routing import decide_image_input_mode, _lookup_supports_vision
-            from hermes_cli.config import load_config
-            from tools.vision_tools import (
-                _build_native_vision_tool_result,
-                _supports_media_in_tool_results,
-            )
+        # Fast path: when native image routing is in effect for the active main
+        # model, attach the screenshot directly instead of describing it through
+        # an auxiliary vision LLM. The model inspects the pixels on its next
+        # turn — no aux call, no information loss. Consistent with vision_analyze.
+        from tools.vision_tools import (
+            _build_native_vision_tool_result,
+            _should_use_native_vision_fast_path,
+        )

-            _provider = _read_main_provider()
-            _model = _read_main_model()
-            _cfg = load_config()
-            _mode = decide_image_input_mode(_provider, _model, _cfg)
-            _supports_vision = _lookup_supports_vision(_provider, _model, _cfg) is True
-            if _mode == "native" and (
-                _supports_media_in_tool_results(_provider, _model)
-                or _supports_vision
-            ):
-                native_result = _build_native_vision_tool_result(
-                    image_url=str(screenshot_path),
-                    question=question,
-                    image_data_url=data_url,
-                    image_size_bytes=len(_screenshot_bytes),
-                )
-                native_result.setdefault("meta", {})
-                native_result["meta"]["screenshot_path"] = str(screenshot_path)
-                if _lp_fallback_warning:
-                    native_result["meta"]["fallback_warning"] = _lp_fallback_warning
-                if annotate and result.get("data", {}).get("annotations"):
-                    native_result["meta"]["annotations"] = result["data"]["annotations"]
-                text_parts = native_result.get("content") or []
-                if text_parts and isinstance(text_parts[0], dict) and text_parts[0].get("type") == "text":
-                    text_parts[0]["text"] = (
-                        str(text_parts[0].get("text", ""))
-                        + f"\n\nScreenshot path: {screenshot_path}"
-                    )
-                native_result["text_summary"] = (
-                    str(native_result.get("text_summary") or "")
-                    + f" Screenshot path: {screenshot_path}"
-                ).strip()
-                return native_result
-        except Exception:
-            pass
+        if _should_use_native_vision_fast_path():
+            native_result = _build_native_vision_tool_result(
+                image_url=str(screenshot_path),
+                question=question,
+                image_data_url=data_url,
+                image_size_bytes=len(_screenshot_bytes),
+            )
+            meta = native_result.setdefault("meta", {})
+            meta["screenshot_path"] = str(screenshot_path)
+            if _lp_fallback_warning:
+                meta["fallback_warning"] = _lp_fallback_warning
+            if annotate and result.get("data", {}).get("annotations"):
+                meta["annotations"] = result["data"]["annotations"]
+            native_result["text_summary"] = (
+                f"{native_result.get('text_summary', '')} "
+                f"Screenshot path: {screenshot_path}"
+            ).strip()
+            return native_result

        vision_prompt = (
            f"You are analyzing a screenshot of a web browser.\n\n"
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@ -476,6 +476,36 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool:
    return False


+def _should_use_native_vision_fast_path() -> bool:
+    """Whether vision tools should attach the image to the main model directly
+    instead of routing through the auxiliary vision LLM.
+
+    True when image routing resolves to ``native`` AND either the provider is
+    known to accept images inside tool results, or the user explicitly declared
+    the model vision-capable via the ``model.supports_vision`` config override.
+    The override is the escape hatch for custom/local providers that aren't in
+    the static allowlist. Best-effort: any resolution failure returns False so
+    the caller falls back to the legacy aux-LLM path.
+    """
+    try:
+        from agent.auxiliary_client import _read_main_provider, _read_main_model
+        from agent.image_routing import decide_image_input_mode, _lookup_supports_vision
+        from hermes_cli.config import load_config
+
+        provider = _read_main_provider()
+        model = _read_main_model()
+        cfg = load_config()
+        if decide_image_input_mode(provider, model, cfg) != "native":
+            return False
+        return (
+            _supports_media_in_tool_results(provider, model)
+            or _lookup_supports_vision(provider, model, cfg) is True
+        )
+    except Exception as exc:
+        logger.debug("Native vision fast-path check failed: %s", exc)
+        return False
+
+
 def _build_native_vision_tool_result(
    image_url: str,
    question: str,
@ -1030,33 +1060,15 @@ def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
    image_url = args.get("image_url", "")
    question = args.get("question", "")

-    # Fast path: when the active main model supports native vision AND the
-    # provider supports image content inside tool results, short-circuit
-    # the auxiliary LLM and return the image bytes as a multimodal
-    # tool-result envelope. The user can force native vision with the 
-    # supports_vision override. The main model sees the pixels directly on its
-    # next turn — no aux call, no information loss, no extra latency.
-    try:
-        from agent.auxiliary_client import _read_main_provider, _read_main_model
-        from agent.image_routing import decide_image_input_mode, _lookup_supports_vision
-        from hermes_cli.config import load_config
-
-        _provider = _read_main_provider()
-        _model = _read_main_model()
-        _cfg = load_config()
-        _mode = decide_image_input_mode(_provider, _model, _cfg)
-        _supports_vision = _lookup_supports_vision(_provider, _model, _cfg) is True
-        if _mode == "native" and (
-            _supports_media_in_tool_results(_provider, _model)
-            or _supports_vision
-        ):
-            logger.info(
-                "vision_analyze: native fast path (provider=%s, model=%s)",
-                _provider, _model,
-            )
-            return _vision_analyze_native(image_url, question)
-    except Exception as exc:
-        logger.debug("Native vision fast-path check failed; using aux LLM: %s", exc)
+    # Fast path: when native image routing is in effect for the active main
+    # model (provider accepts images in tool results, or the user set the
+    # model.supports_vision override), short-circuit the auxiliary LLM and
+    # return the image bytes as a multimodal tool-result envelope. The main
+    # model sees the pixels directly on its next turn — no aux call, no
+    # information loss, no extra latency.
+    if _should_use_native_vision_fast_path():
+        logger.info("vision_analyze: native fast path")
+        return _vision_analyze_native(image_url, question)

    # Legacy path: aux LLM describes the image and we return its text.
    full_prompt = (